Assess the presence and abundance of Gardnerella spp. and their impacts on the vaginal microbiomes of pregnant women from three distinct cohorts.

Set up

Load Packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.5     ✔ dplyr   1.0.7
## ✔ tidyr   1.1.4     ✔ stringr 1.4.0
## ✔ readr   2.0.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(kableExtra)
library(broom)
library(formattable)
library(reshape2)
library(vegan)
library(ggbeeswarm)
library(ggExtra)
 library(coin)
library(cowplot)
library(rstatix)
library(ggpubr)
`%!in%` <- negate(`%in%`)
set.seed(7334)

Define file paths

figureOut <- "../FIGURES/submission_manuscript_figures"

tmpFigureOut <- "../FIGURES/submission_manuscript_figures/tmp" # folder for storing temp files for figure formatting

suDF <- "./sumgsDF.tsv" %>% # metadata for Stanford and UAB
  read_tsv %>%
  mutate(SampleID=as.character(SampleID))

hmp2DF <- "./hmp2mgsDF.tsv" %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID),
         Cohort="MOMS-PI")

# sample data
sgDF0 <- suDF %>%
  select("SampleID", "SubjectID", "Cohort", "sampleTrimester", "GDcoll", "GWdel", "Age", "Income", "Education", "deliveryMode", "Race", "Ethnicity", "TermPre", "TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs") %>%
  mutate(Cohort=paste(Cohort, "Enriched")) %>%
  full_join(hmp2DF[,c("SampleID", "SubjectID", "Cohort", "sampleTrimester", "GDcoll", "GWdel", "Age", "Income", "Education", "deliveryMode", "Race", "Ethnicity", "TermPre", "TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs")], by = c("SampleID", "SubjectID", "Cohort", "sampleTrimester", "GDcoll", "GWdel", "Age", "Income", "Education", "deliveryMode", "Race", "Ethnicity", "TermPre", "TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs")) %>%
  mutate(Cohort=factor(Cohort, levels = c("Stanford Enriched", "UAB Enriched", "MOMS-PI")), # Stanford and UAB samples were selected for enrichment in Gardnerella, label them has Enriched.
         SampleID=as.character(SampleID), 
         SubjectID=as.character(SubjectID),
         pctHuman=((Sickle_pairs-bbmapFiltered_pairs)/Sickle_pairs)*100, # calculate percent human reads
         bacLoad=bbmapFiltered_pairs/(Sickle_pairs-bbmapFiltered_pairs)) # calculate microbial load

Taxa abundance tables Stanford and UAB cohorts:

suMetaphlanPath <- "../humann2/merged_tables/mergedMetaphlanAbundance.tsv"
suGardCladePath <- "./gardCladeRelativeAbundance.tsv"
suGardGenomoPath <- "./gardGenomospeciesRelativeAbundance.tsv"

MOMS-PI (HMP2) cohort

hmp2MetaphlanPath <- "../humann2/merged_tables/hmp2mgs_mergedMetaphlanAbundance.tsv"
hmp2GardCladePath <- "./hmp2gardCladeRelativeAbundance.tsv"
hmp2GardGenomoPath <- "./hmp2gardGenomospeciesRelativeAbundance.tsv"

ggplot settings

Themes:

theme_set(theme_bw()+
          theme(panel.grid = element_blank(),
                axis.text = element_text(size=10),
                axis.title = element_text(size=15),
                legend.text = element_text(size=12),
                legend.title = element_text(size=14)))

Clade colors:

cladeColors <-  list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73"), labels=c("C1", "C2", "C3", "C4", "C5", "C6")),
                     scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73"), labels=c("C1", "C2", "C3", "C4", "C5", "C6")))

cladeColorsUnch <-  list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)))),
                     scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)))))

cladeColorsTotal <- list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Total'~italic(Gardnerella)))),
                         scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Total'~italic(Gardnerella)))))

cladeColorsTotalUnch <- list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)),  bquote('Total'~italic(Gardnerella)))),
                         scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", "#A9A9A9", "#000000"), labels=c("C1", "C2", "C3", "C4", "C5", "C6", bquote('Uncharacterized'~italic(Gardnerella)),  bquote('Total'~italic(Gardnerella)))))

genomoColorsTotal <- list(scale_color_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#000000"),
                     labels=c(bquote(italic("G. vaginalis")),
                              "GS2",
                              "GS3",
                              bquote(italic("G. piotti")),
                              "GS11",
                              "GS7",
                              "GS8",
                              "GS9",
                              "GS10",
                              "GS14",
                              bquote(italic("G. leopoldii")),
                              bquote(italic("G. swidsinskii")),
                              "GS12",
                              "GS13",
                              bquote('Total'~italic(Gardnerella)))),
                     scale_fill_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#000000"),
                     labels=c(bquote(italic("G. vaginalis")),
                              "GS2",
                              "GS3",
                              bquote(italic("G. piotti")),
                              "GS11",
                              "GS7",
                              "GS8",
                              "GS9",
                              "GS10",
                              "GS14",
                              bquote(italic("G. leopoldii")),
                              bquote(italic("G. swidsinskii")),
                              "GS12",
                              "GS13",
                              bquote('Total'~italic(Gardnerella)))))


genomoColorsTotalUnch <- list(scale_color_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#A9A9A9", "#000000"),
                     labels=c(bquote(italic("G. vaginalis")),
                              "GS2",
                              "GS3",
                              bquote(italic("G. piotti")),
                              "GS11",
                              "GS7",
                              "GS8",
                              "GS9",
                              "GS10",
                              "GS14",
                              bquote(italic("G. leopoldii")),
                              bquote(italic("G. swidsinskii")),
                              "GS12",
                              "GS13",
                              bquote('Uncharacterized'~italic(Gardnerella)),
                              bquote('Total'~italic(Gardnerella)))),
                     scale_fill_manual(values=c("salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "cadetblue1", "blue4", "blue", "#E69F00", "#009E73", "#A9A9A9", "#000000"),
                     labels=c(bquote(italic("G. vaginalis")),
                              "GS2",
                              "GS3",
                              bquote(italic("G. piotti")),
                              "GS11",
                              "GS7",
                              "GS8",
                              "GS9",
                              "GS10",
                              "GS14",
                              bquote(italic("G. leopoldii")),
                              bquote(italic("G. swidsinskii")),
                              "GS12",
                              "GS13",
                              bquote('Uncharacterized'~italic(Gardnerella)),
                              bquote('Total'~italic(Gardnerella)))))

Gardnerella and Lactobacillus Colors:

gLColors <-   list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73","darkgreen", "khaki1", "darkblue", "darkred"), labels=c("G. vaginalis C1", "G. vaginalis C2", "G. vaginalis C3", "G. vaginalis C4", "G. vaginalis C5", "G. vaginalis C6", "L. crispatus", "L. gasseri", "L. iners", "L. jensenii")),
  scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73","darkgreen", "khaki1", "darkblue", "darkred", "darkseagreen4"), labels=c("G. vaginalis C1", "G. vaginalis C2", "G. vaginalis C3", "G. vaginalis C4", "G. vaginalis C5", "G. vaginalis C6", "L. crispatus", "L. gasseri", "L. iners", "L. jensenii")))

All Taxa Colors:

taxColors <-   list(scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", 
                                                 "darkgreen", "khaki1", "darkred", "darkblue",
                                                 "mediumpurple4", "#C42104",  "darkseagreen4", "#2004C2"), 
                                        labels=c("C1", "C2", "C3", "C4", "C5", "C6",
                                                 bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
                                                 bquote(italic("A. vaginae")), bquote(italic("M. hominis")), bquote(italic("P. bivia")), bquote(italic("U. parvum"))
                                        )), 
                     scale_fill_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73",
                                                 "darkgreen", "khaki1", "darkred", "darkblue",
                                                 "mediumpurple4", "#C42104",  "darkseagreen4", "#2004C2"), 
                                        labels=c("C1", "C2", "C3", "C4", "C5", "C6",
                                                 bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
                                                 bquote(italic("A. vaginae")), bquote(italic("M. hominis")), bquote(italic("P. bivia")), bquote(italic("U. parvum"))
                                        )))

taxColorsOther <-   list(scale_color_manual(values=c("#808080", "#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73", 
                                                 "darkgreen", "khaki1", "darkred", "darkblue",
                                                 "mediumpurple4", "#C42104",  "darkseagreen4", "#2004C2"), 
                                        labels=c("Other", "C1", "C2", "C3", "C4", "C5", "C6",
                                                 bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
                                                 bquote(italic("A. vaginae")), bquote(italic("M. hominis")), bquote(italic("P. bivia")), bquote(italic("U. parvum"))
                                        )), 
                     scale_fill_manual(values=c("#808080", "#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73",
                                                 "darkgreen", "khaki1", "darkred", "darkblue",
                                                 "mediumpurple4", "#C42104",  "darkseagreen4", "#2004C2"), 
                                        labels=c("Other", "C1", "C2", "C3", "C4", "C5", "C6",
                                                 bquote(italic("L. crispatus")), bquote(italic("L. gasseri")), bquote(italic("L. jensenii")), bquote(italic("L.iners")),
                                                 bquote(italic("A. vaginae")), bquote(italic("M. hominis")), bquote(italic("P. bivia")), bquote(italic("U. parvum"))
                                        )))

Bi/Tri/Quad Colors:

binaryColors <- list(scale_color_manual(values = c("#D55E00", "#56B4E9")),
                     scale_fill_manual(values=c("#D55E00", "#56B4E9")))

trinaryColors <- list(scale_color_manual(values = c("#F77754", "#018790", "#0A516D")),
                      scale_fill_manual(values = c(c("#F77754", "#018790", "#0A516D"))))

quadColors <- list(scale_color_manual(values = c("#F77754", "#018790", "#0A516D", "#2B2726")),
                      scale_fill_manual(values = c(c("#F77754", "#018790", "#0A516D", "#2B2726")))) 

quadColors2 <- list(scale_color_manual(values = c("#D55E00", "#009E73", "#CC79A7", "#56B4E9")),
                     scale_fill_manual(values=c("#D55E00","#009E73", "#CC79A7", "#56B4E9")))

Other shortcuts

Create some shortcut lists for convenience.

## Other shortcuts
clades <- c("C1", "C2", "C3", "C4", "C5", "C6")
genomos <- c("GS1", "GS2", "GS3", "GS4", "GS5", "GS6", "GS7", "GS8", "GS9", "GS10", "GS11", "GS12", "GS13", "GS14")
genomoNames <- c("G. vag.", "GS2", "GS3", "G. pio.", "G. leo.", "G. swi.", "GS7", "GS8", "GS9", "GS10", "GS11", "GS12", "GS13", "GS14")
genomosCladeOrder <- c("GS1", "GS2", "GS3", "GS4", "GS11", "GS7", "GS8", "GS9", "GS10", "GS14", "GS5", "GS6", "GS12", "GS13")
genomoNamesCladeOrder <- c("G. vag.", "GS2", "GS3", "G. pio.", "GS11", "GS7", "GS8", "GS9", "GS10", "GS14", "G. leo.", "G. swi.", "GS12", "GS13")

lactos <- c("Lactobacillus_crispatus", "Lactobacillus_gasseri", "Lactobacillus_jensenii", "Lactobacillus_iners")
anaerobes <- c("Atopobium_vaginae", "Finegoldia_magna",  "Mycoplasma_hominis", "Megasphaera_genomosp_type_1", "Megasphaera_unclassified", "Prevotella_amnii", "Prevotella_bivia", "Prevotella_timonensis",  "Ureaplasma_parvum")
anaerobesSL <- c("Atopobium_vaginae", "Mycoplasma_hominis", "Prevotella_bivia", "Ureaplasma_parvum") # short list of anaerobes

abbrevs <- c("TG", clades, "Lc", "Lg", "Lj", "Li", "Av", "Fm", "Mh", "Meg1", "Megu", "Pa", "Pb", "Pt", "Up")
abbrevsSL <- c("TG", clades, "Lc", "Lg", "Lj", "Li",  "Av", "Mh", "Pb", "Up")  # short list of abbreviations

cohorts <- c("Stanford Enriched", "UAB Enriched", "MOMS-PI Enriched", "MOMS-PI")
cohortAbbrevs <- c("Stanford Enr.", "UAB Enr.", "MOMS-PI Enr.", "MOMS-PI")

Remove low-read samples

Starting number of samples and subjects:

sgDF0 %>%
  group_by(Cohort) %>%
  summarize(`N Subjects`=n_distinct(SubjectID), `N Samples`=n_distinct(SampleID)) %>%
  formattable()
Cohort N Subjects N Samples
Stanford Enriched 20 62
UAB Enriched 15 45
MOMS-PI 234 930

Library sizes

Paired reads in total library size, after quality filtering by Sickle, and after removing human reads.

sgDF0 %>%
  gather("Step", "paired_reads", c(TotalPairs, Sickle_pairs, bbmapFiltered_pairs)) %>%
  mutate(Step=factor(Step, levels=c("TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs"), labels=c("Library Size", "Sickle Filtered", "Microbial Reads"))) %>%
  group_by(Step, Cohort) %>%
  summarise(min=min(paired_reads), mean=mean(paired_reads), max=max(paired_reads)) %>%
  mutate_if(is.numeric, ~prettyNum(.x, big.mark = ",")) %>%
  formattable()
Step Cohort min mean max
Library Size Stanford Enriched 14,655,421 19,681,739 29,263,108
Library Size UAB Enriched 13,081,280 19,239,013 35,557,864
Library Size MOMS-PI 12,210 10,162,162 89,647,588
Sickle Filtered Stanford Enriched 12,757,737 17,122,069 24,968,647
Sickle Filtered UAB Enriched 11,695,954 16,964,926 30,690,425
Sickle Filtered MOMS-PI 0 6,877,062 78,549,522
Microbial Reads Stanford Enriched 45,287 2,108,541 17,576,674
Microbial Reads UAB Enriched 81,037 3,811,026 15,652,482
Microbial Reads MOMS-PI 0 669,755.7 30,053,898

Percent filtered at each step

sgDF0 %>%
  mutate(`Sickle % Total`=Sickle_pairs/TotalPairs,
         `BBmap % Sickle`=bbmapFiltered_pairs/Sickle_pairs,
         `BBmap % Total`=bbmapFiltered_pairs/TotalPairs) %>%
  group_by(Cohort) %>%
  summarise_at(c("Sickle % Total",  "BBmap % Sickle", "BBmap % Total"), ~median(.x, na.rm=TRUE)) %>%
  formattable()
Cohort Sickle % Total BBmap % Sickle BBmap % Total
Stanford Enriched 0.8685448 0.04800637 0.04134656
UAB Enriched 0.8793049 0.20023969 0.17588516
MOMS-PI 0.7215741 0.04755454 0.02472847

sickle_pct_total = percent of paired reads left after sickle filtering bbmap_pct_sicke = percent of sickle paired reads left after removing human reads bbmap_pct_total = percent of all paired reads left after sickle filtering and removing human reads

View number of paired reads after sickle and human filtering per sample and determine filtering threshold.

p <- sgDF0 %>%
  ggplot(aes(x=Sickle_pairs, y=bbmapFiltered_pairs, fill=Cohort, color=Cohort)) +
  geom_point(alpha=0.25) +
  trinaryColors +
  geom_vline(xintercept = 1e6, color="grey", linetype="dashed") + # 1 million paired reads after sickle
  scale_x_log10(n.breaks=10) +
  scale_y_log10(n.breaks=10) +
  theme_bw() +
  theme(legend.position = "bottom",
        axis.text = element_text(size=10),
        axis.title = element_text(size=15),
        legend.text = element_text(size=12),
        legend.title = element_text(size=14)) +
  labs(x="QC Filtered Paired Reads", y="Microbial Paired Reads")
ggMarginal(p, groupFill = TRUE, type="density")

Remove the samples with fewer than 1 million paired reads after Sickle filtering and trimming.

QCfilterSamples <- sgDF0 %>%
  filter(Sickle_pairs>1e6) %>% # list of samples that pass QC filtering criteria and will be used in analyses
  .$SampleID
length(QCfilterSamples) # number of samples that will be used in analyses
## [1] 888
sgDF1 <- sgDF0 %>%
  filter(SampleID %in% QCfilterSamples)

Number of samples remaining in each cohort:

sgDF1 %>%
  group_by(Cohort) %>%
  summarize(`N Subjects`=n_distinct(SubjectID), `N Samples`=n_distinct(SampleID)) %>%
  formattable()
Cohort N Subjects N Samples
Stanford Enriched 20 62
UAB Enriched 15 45
MOMS-PI 231 781

Will proceed with a total of 888 samples, 62 from Stanford, 45 from UAB, and 781 from MOMS-PI

Library sizes and percent filtered at each step post QC sample removal:

readCountsPlot <- sgDF1 %>%
  gather("Step", "paired_reads", c(TotalPairs, Sickle_pairs, bbmapFiltered_pairs)) %>%
  mutate(Step=factor(Step, levels=c("TotalPairs", "Sickle_pairs", "bbmapFiltered_pairs"), labels=c("Library Size", "Sickle Filtered", "Microbial Reads")),
         Cohort=factor(Cohort, levels = c("Stanford Enriched", "UAB Enriched", "MOMS-PI"), labels=c("Sanford\nEnriched", "UAB\nEnriched", "MOMS-PI"))) %>%
  ggplot(aes(x=Cohort, y=paired_reads, fill=Cohort, color=Cohort)) +
  geom_quasirandom(dodge.width = 0.9, alpha=0.5, size=1, show.legend = FALSE) +
  geom_boxplot(alpha=0, position=position_dodge(width=0.9), color="black", show.legend = FALSE) +
  scale_y_continuous(n.breaks = 10, minor_breaks = waiver()) +
  facet_wrap(~Step) +
  trinaryColors +
  theme_bw() +
  theme(axis.text.x = element_text(size=10),
        axis.text.y=element_text(size=13),
        strip.text = element_text(size=13),
        axis.title = element_text(size=16), 
        aspect.ratio = 1.5) +
  labs(x=NULL, y="Paired Reads")

Percent Human Reads

percentHumanHists <- sgDF1 %>%
  group_by(Cohort) %>%
  ggplot(aes(pctHuman)) +
  geom_histogram(aes(y=stat(width*density))) +
  scale_y_continuous(limits = c(0,0.4), n.breaks=9) +
  facet_wrap(~Cohort) +
  theme(axis.text = element_text(size=13),
        axis.title = element_text(size=16),
        strip.text = element_text(size=13),
        aspect.ratio = 1.5) +
  labs(x="Percent Human Reads", y="Proportion of Samples")

MOMS-PI samples had greatest percent of human reads overall. They were not selected for enrichment in Gardnerella. Samples in UAB cohort had the smallest percent of human reads.

Summary table of library sizes and percent human reads of QC filtered samples

readsTable <- sgDF1 %>%
  group_by(Cohort) %>%
  summarise_at(vars(TotalPairs, Sickle_pairs, bbmapFiltered_pairs, pctHuman), list(mean=mean, sd=sd)) %>%
  mutate_at(vars(pctHuman_mean, pctHuman_sd), ~round(.x, 2)) %>%
  mutate_if(is.numeric, ~prettyNum(.x, big.mark = ",")) %>%
  transmute(Cohort=Cohort,
            `Library Size`=paste0(TotalPairs_mean, " (", TotalPairs_sd, ")"),
            `Sickle Filtered`=paste0(Sickle_pairs_mean, " (", Sickle_pairs_sd, ")"),
            `Microbial Reads`=paste0(bbmapFiltered_pairs_mean, " (", bbmapFiltered_pairs_sd, ")"),
            `Percent Human Reads`=paste0(pctHuman_mean, " (", pctHuman_sd, ")")) %>%
  kbl() %>%
  kable_classic(full_width=TRUE, html_font = "Arial")

#save table to tmp file for adding to rest of read count figure

readsTablePngPath <- file.path(tmpFigureOut, paste(Sys.Date(), "readsTableComponent.png", sep="_"))

readsTable %>%
  save_kable(file = readsTablePngPath, zoom=4)

Figure S3: Read counts and percent human reads

readsAB <- plot_grid(readCountsPlot, percentHumanHists, nrow = 1, align = "hv", axis="tb", labels = c("A", "B"), label_size = 15)

printReadsTable <- ggdraw() + draw_image(readsTablePngPath)

plot_grid(readsAB, printReadsTable, nrow=2, rel_heights = c(1, 0.25), align = "v", axis = "l", labels = c("", "C"), label_size = 15)

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS3_ReadCounts.png", sep="_")))

Metadata

Description of cohorts and sample collection schemes

Figure S2: Gestational age at collection

samplingSched <- sgDF1 %>%
  filter(!is.na(GDcoll)) %>%
  mutate(GWcoll=GDcoll/7,
         GWcoll=floor(GWcoll),
         Cohort=factor(Cohort, levels=c("Stanford Enriched", "UAB Enriched", "MOMS-PI"))) %>%
  split(.$Cohort) %>%
  map(., ~ggplot(.x) +
    geom_point(aes(x=ceiling(GWcoll), y=SubjectID), size=1, alpha=0.5) +
    geom_point(aes(x=(GWdel+0.5), y=SubjectID), shape=")") +
    facet_grid(.~Cohort) +
    theme(axis.text = element_text(size=13),
        axis.title = element_text(size=16),
        strip.text = element_text(size=13),
        aspect.ratio = 1.5) +
    labs(x=NULL, y=NULL))

stanfordSamplingSched <- samplingSched$`Stanford Enriched` +
  labs(y="Subject ID")

uabSamplingSched <-  samplingSched$`UAB Enriched` +
  labs(x="Gestation Weeks")

momspiSamplingSched <- samplingSched$`MOMS-PI` +
  theme(axis.text.y = element_text(size=3))

plot_grid(stanfordSamplingSched, uabSamplingSched, momspiSamplingSched, rel_widths = c(1, 1, 1), rel_heights = c(1, 1, 1), align = "hv", nrow = 1) 

#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS2_samplingSchedule.png", sep="_")))
sgDF1 %>%
  filter(!is.na(GDcoll)) %>%
  mutate(GWcoll=GDcoll/7,
         GWcoll=floor(GWcoll)) %>%
  group_by(Cohort) %>%
  summarize(across(GWcoll, list(median=median, `25th`=~quantile(.x, 0.25), `75th`=~quantile(.x, 0.75))))
## # A tibble: 3 × 4
##   Cohort            GWcoll_median GWcoll_25th GWcoll_75th
##   <fct>                     <dbl>       <dbl>       <dbl>
## 1 Stanford Enriched            22          16          31
## 2 UAB Enriched                 26          19          32
## 3 MOMS-PI                      29          20          36

Get MOMS-PI subjects with gestational age at delivery

hmp2SubjectsWDel <- sgDF1 %>%
  filter(Cohort=="MOMS-PI",
         !is.na(TermPre)) %>%
  .$SubjectID %>%
  unique

head(hmp2SubjectsWDel)
## [1] "2442683" "2442687" "2442892" "2442905" "2442618" "2443013"
length(hmp2SubjectsWDel)
## [1] 133

Table 2: Demographic and Clinical data

#GAAD 
gaadDF <- sgDF1 %>%
  select(SubjectID, Cohort, TermPre, GWdel) %>%
  unique() %>%
  add_count(Cohort, TermPre) %>%
  group_by(Cohort, TermPre, n) %>%
  summarize(mean=round(mean(GWdel, na.rm=TRUE), 1), 
            sd=round(sd(GWdel, na.rm = TRUE), 1)) %>%
  replace_na(list(mean="-",
                  sd="-")) %>%
  ungroup() %>%
  mutate(TermPre=case_when(TermPre=="T"~"Term",
                           TermPre=="P"~"Preterm",
                           is.na(TermPre)~"Unknown"),
         cat=paste0(Cohort, "\n", TermPre, " (n=", n, ")"),
         `Mean gestational age in weeks at delivery (±SD)`=paste0(mean, " (", sd, ")")) %>%
  replace_na(list(`Mean gestational age in weeks at delivery (±SD)`="-")) %>%
  select(cat, `Mean gestational age in weeks at delivery (±SD)`) %>%
  column_to_rownames(var="cat") %>%
  t %>%
  as.data.frame %>%
  rownames_to_column(var="x") %>%
  select("x", "Stanford Enriched\nTerm (n=11)", "Stanford Enriched\nPreterm (n=9)", "UAB Enriched\nTerm (n=5)", "UAB Enriched\nPreterm (n=10)", "MOMS-PI\nTerm (n=90)", "MOMS-PI\nPreterm (n=43)", "MOMS-PI\nUnknown (n=98)")

# Demographics
demDF <- sgDF1 %>%
  select(SubjectID, Cohort, TermPre, Age, Race, Ethnicity, Education, Income, deliveryMode) %>%
  unique() %>%
  mutate(Cohort=as.character(Cohort)) %>%
  add_count(Cohort, TermPre, name="term_count") %>%
  mutate(TermPre=case_when(TermPre=="T"~"Term",
                           TermPre=="P"~"Preterm",
                           is.na(TermPre)~"Unknown"),
         cat=paste0(Cohort, "\n", TermPre, " (n=", term_count, ")")) %>%
  select(-SubjectID, -TermPre, -Cohort, -term_count) %>%
  map2_df(., data.frame(matrix(rep(.$cat, 7), ncol=7)), ~count(data.frame(x=.x, y=.y), x, y), .id="z") %>%
  filter(z!="cat") %>%
  group_by(z,y) %>%
  mutate(n=paste0(n, " (", round(n/sum(n)*100, 0), "%)")) %>%
  ungroup %>%
  spread(y, n) %>%
  mutate_all(~replace_na(.x, "0 (0%)")) %>%
  mutate(x=paste(x,z, sep="_"), #add variable category for arranging purposes because "unknown" appears multiple times and therefore causes an error when making x a factor class.
         x=factor(x, c("Below 18_Age", "18 to 28_Age", "29 to 38_Age", "Above 38_Age", "Unknown_Age", "Asian_Race", "Black_Race", "White_Race", "Other_Race", "Hispanic_Ethnicity", "Non-Hispanic_Ethnicity", "Unknown_Ethnicity", "Less than high school_Education", "High school diploma or GED_Education", "Some college_Education", "Bachelor or undergraduate degree_Education", "Post-undergraduate degree_Education", "Unknown_Education", "No income_Income", "Under $10,000_Income", "$10,000 - $30,000_Income", "$30,000 - $50,000_Income", "$50,000 - $60,000_Income", "$80,000 - $100,000_Income", "$150,000 - $200,000_Income", "$250,000 or more_Income", "Under $15,000_Income", "$15,000 - $19,999_Income", "$20,000 - $39,999_Income", "$40,000 - $59,999_Income", "$60,000 - $79,999_Income", "$80,000 or more_Income", "Unknown_Income", "Vaginal_deliveryMode", "Cesarean_deliveryMode", "Unknown_deliveryMode"))) %>%
  arrange(x) %>% # order rows
  mutate(x=str_extract(x, ".*(?=_)")) %>% # remove category label now that rows are arranged 
  mutate_at(c("Stanford Enriched\nTerm (n=11)", "Stanford Enriched\nPreterm (n=9)", "UAB Enriched\nTerm (n=5)", "UAB Enriched\nPreterm (n=10)"), # remove 0's from income rows where they don't belong due to different levels in Stanford/UAB vs. MOMS-PI for income
            ~case_when(x %in% c("Under $15,000", "$15,000 - $19,999", "$20,000 - $39,999", "$40,000 - $59,999", "$60,000 - $79,999", "$80,000 or more")~"-",
                       x %!in% c("Under $15,000", "$15,000 - $19,999", "$20,000 - $39,999", "$40,000 - $59,999", "$60,000 - $79,999", "$80,000 or more")~.x)) %>%
  mutate_at(c("MOMS-PI\nTerm (n=90)", "MOMS-PI\nPreterm (n=43)", "MOMS-PI\nUnknown (n=98)"), # remove 0's from income rows where they don't belong due to different levels in Stanford/UAB vs. MOMS-PI for income
            ~case_when(x %in% c("No income", "Under $10,000", "$10,000 - $30,000", "$30,000 - $50,000", "$50,000 - $60,000", "$80,000 - $100,000", "$150,000 - $200,000", "$250,000 or more")~"-",
                       x %!in% c("No income", "Under $10,000", "$10,000 - $30,000", "$30,000 - $50,000", "$50,000 - $60,000", "$80,000 - $100,000", "$150,000 - $200,000", "$250,000 or more")~.x)) %>% 
  select(x, `Stanford Enriched\nTerm (n=11)`, `Stanford Enriched\nPreterm (n=9)`, `UAB Enriched\nTerm (n=5)`, `UAB Enriched\nPreterm (n=10)`, `MOMS-PI\nTerm (n=90)`, `MOMS-PI\nPreterm (n=43)`, `MOMS-PI\nUnknown (n=98)`) # order columns

demTable <- gaadDF %>%
  rbind(demDF) %>%
  dplyr::rename("  "=x) %>%
  dplyr::rename_at(2:8, ~str_extract(.x, "(?<=\\n).*")) %>%
  kbl(caption = "Table 2. Sampling Cohorts") %>%
  kable_classic(full_width=TRUE, html_font = "Arial") %>%
  add_header_above(c(" ", "Stanford Enriched" = 2, "UAB Enriched" = 2, "MOMS-PI" = 3)) %>%
  pack_rows("Age", 2, 6) %>%
  pack_rows("Race", 7, 10) %>%
  pack_rows("Ethnicity", 11, 13) %>%
  pack_rows("Education", 14, 19) %>%
  pack_rows("Income", 20, 34) %>%
  pack_rows("Delivery Mode", 35, 37)
demTable
Table 2. Sampling Cohorts
Stanford Enriched
UAB Enriched
MOMS-PI
Term (n=11) Preterm (n=9) Term (n=5) Preterm (n=10) Term (n=90) Preterm (n=43) Unknown (n=98)
Mean gestational age in weeks at delivery (±SD) 39.2 (1.7) 33.9 (2.3) 37.8 (1.3) 32.7 (3.4) 40 (0.7) 34.1 (3.4)
  • (-)
Age
Below 18 0 (0%) 0 (0%) 0 (0%) 0 (0%) 2 (2%) 1 (2%) 1 (1%)
18 to 28 2 (18%) 0 (0%) 4 (80%) 6 (60%) 57 (63%) 28 (65%) 54 (55%)
29 to 38 4 (36%) 6 (67%) 1 (20%) 3 (30%) 27 (30%) 12 (28%) 39 (40%)
Above 38 0 (0%) 0 (0%) 0 (0%) 0 (0%) 4 (4%) 2 (5%) 2 (2%)
Unknown 5 (45%) 3 (33%) 0 (0%) 1 (10%) 0 (0%) 0 (0%) 2 (2%)
Race
Asian 2 (18%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 2 (2%)
Black 0 (0%) 0 (0%) 3 (60%) 8 (80%) 70 (78%) 30 (70%) 56 (57%)
White 6 (55%) 4 (44%) 0 (0%) 1 (10%) 14 (16%) 6 (14%) 30 (31%)
Other 3 (27%) 5 (56%) 2 (40%) 1 (10%) 6 (7%) 7 (16%) 10 (10%)
Ethnicity
Hispanic 3 (27%) 6 (67%) 1 (20%) 0 (0%) 6 (7%) 4 (9%) 4 (4%)
Non-Hispanic 5 (45%) 2 (22%) 4 (80%) 9 (90%) 84 (93%) 39 (91%) 93 (95%)
Unknown 3 (27%) 1 (11%) 0 (0%) 1 (10%) 0 (0%) 0 (0%) 1 (1%)
Education
Less than high school 1 (9%) 3 (33%) 3 (60%) 4 (40%) 6 (7%) 5 (12%) 11 (11%)
High school diploma or GED 1 (9%) 3 (33%) 1 (20%) 3 (30%) 38 (42%) 20 (47%) 27 (28%)
Some college 2 (18%) 1 (11%) 1 (20%) 2 (20%) 23 (26%) 9 (21%) 20 (20%)
Bachelor or undergraduate degree 4 (36%) 1 (11%) 0 (0%) 0 (0%) 17 (19%) 6 (14%) 24 (24%)
Post-undergraduate degree 2 (18%) 1 (11%) 0 (0%) 0 (0%) 4 (4%) 3 (7%) 14 (14%)
Unknown 1 (9%) 0 (0%) 0 (0%) 1 (10%) 2 (2%) 0 (0%) 2 (2%)
Income
No income 1 (9%) 0 (0%) 2 (40%) 0 (0%)
Under $10,000 0 (0%) 0 (0%) 2 (40%) 4 (40%)
$10,000 - $30,000 2 (18%) 4 (44%) 1 (20%) 5 (50%)
$30,000 - $50,000 1 (9%) 1 (11%) 0 (0%) 0 (0%)
$50,000 - $60,000 2 (18%) 1 (11%) 0 (0%) 0 (0%)
$80,000 - $100,000 2 (18%) 0 (0%) 0 (0%) 0 (0%)
$150,000 - $200,000 1 (9%) 0 (0%) 0 (0%) 0 (0%)
$250,000 or more 1 (9%) 1 (11%) 0 (0%) 0 (0%)
Under $15,000
58 (64%) 27 (63%) 46 (47%)
$15,000 - $19,999
8 (9%) 3 (7%) 7 (7%)
$20,000 - $39,999
11 (12%) 7 (16%) 8 (8%)
$40,000 - $59,999
4 (4%) 2 (5%) 9 (9%)
$60,000 - $79,999
0 (0%) 1 (2%) 8 (8%)
$80,000 or more
4 (4%) 1 (2%) 15 (15%)
Unknown 1 (9%) 2 (22%) 0 (0%) 1 (10%) 5 (6%) 2 (5%) 5 (5%)
Delivery Mode
Vaginal 5 (45%) 1 (11%) 5 (100%) 8 (80%) 71 (79%) 38 (88%) 76 (78%)
Cesarean 5 (45%) 8 (89%) 0 (0%) 2 (20%) 16 (18%) 3 (7%) 14 (14%)
Unknown 1 (9%) 0 (0%) 0 (0%) 0 (0%) 3 (3%) 2 (5%) 8 (8%)
#save_kable(demTable, file = file.path(figureOut, paste(Sys.Date(), "Table2_demTable.png", sep="_")), zoom=5)

Set up taxa data frames

Organize and set up tables from MetaPhlAn2 and Gardnerella mapping method. ## MetaPhlAn2 Species relative abundances for Stanford and UAB cohorts:

sumDF <- suMetaphlanPath %>%
  read_tsv() %>%
  filter(str_detect(ID, "s__"), !str_detect(ID, "t__")) %>%
  mutate(Species = str_extract(ID, "(?<=s__)\\w+$")) %>%
  select(Species, everything(), -ID) %>%
  dplyr::rename_at(2:ncol(.), ~str_extract(.x, "[0-9]{10}")) %>%
  column_to_rownames(var="Species") %>%
  t %>%
  as.data.frame() %>%
  mutate_all(funs(./100)) %>%
  rownames_to_column(var="SampleID")
sumDF[1:6,1:5]
##     SampleID Actinobaculum_massiliense Actinobaculum_schaalii
## 1 1000801248                         0                      0
## 2 1000801318                         0                      0
## 3 1000801368                         0                      0
## 4 1001301158                         0                      0
## 5 1001301248                         0                      0
## 6 1001301318                         0                      0
##   Actinobaculum_urinale Actinomyces_europaeus
## 1                     0                     0
## 2                     0                     0
## 3                     0                     0
## 4                     0                     0
## 5                     0                     0
## 6                     0                     0

Species relative abundances for MOMS-PI cohort:

hmp2mDF <- hmp2MetaphlanPath %>%
  read_tsv %>%
  filter(str_detect(ID, "s__"), !str_detect(ID, "t__")) %>%
  mutate(Species = str_extract(ID, "(?<=s__)\\w+$")) %>%
  select(Species, everything(), -ID) %>%
  as.data.frame() %>%
  dplyr::rename_at(2:ncol(.), ~str_extract(.x, "[0-9]{7}")) %>%
  column_to_rownames(var="Species") %>%
  t %>%
  as.data.frame() %>%
  mutate_all(funs(./100)) %>%
  rownames_to_column(var="SampleID") %>%
  filter(SampleID %in% QCfilterSamples) # keep only samples that pass quality filtering
hmp2mDF[1:6,1:5]
##   SampleID Methanobrevibacter_smithii Actinobaculum_massiliense
## 1  6743909                          0                         0
## 2  6743911                          0                         0
## 3  6743912                          0                         0
## 4  6743914                          0                         0
## 5  6743916                          0                         0
## 6  6743917                          0                         0
##   Actinobaculum_schaalii Actinobaculum_unclassified
## 1                      0                          0
## 2                      0                          0
## 3                      0                          0
## 4                      0                          0
## 5                      0                          0
## 6                      0                          0

Sanity check that relative abundances sum to 1

hmp2mDF %>%
  column_to_rownames("SampleID") %>%
  rowSums %>%
  as.data.frame() %>%
  summary
##        .          
##  Min.   :0.05339  
##  1st Qu.:1.00000  
##  Median :1.00000  
##  Mean   :0.99762  
##  3rd Qu.:1.00000  
##  Max.   :1.00000
hmp2mDF %>%
  column_to_rownames("SampleID") %>%
  rowSums %>%
  as.data.frame() %>%
  ggplot(aes(x=`.`)) +
  geom_histogram()

Relative abundances of most samples sum to 1.

What is in samples that do not sum to 1?

hmp2MetaphlanPath %>%
  read_tsv %>%
  dplyr::rename_at(2:ncol(.), ~str_extract(.x, "[0-9]{7}")) %>%
  select(ID, "6744527", "6744289") %>%
  filter(`6744527`>0|`6744289`>0) %>%
  formattable()
ID 6744527 6744289
k__Bacteria 100.00000 100.0000
k__Bacteria|p__Actinobacteria 94.66109 46.8811
k__Bacteria|p__Actinobacteria|c__Actinobacteria 94.66109 46.8811
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 94.66109 46.8811
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae 94.66109 46.8811
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacteriaceae_unclassified 94.66109 46.8811
k__Bacteria|p__Firmicutes 5.33891 53.1189
k__Bacteria|p__Firmicutes|c__Bacilli 5.33891 53.1189
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales 5.33891 53.1189
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae 5.33891 53.1189
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus 5.33891 53.1189
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_gasseri 1.04474 0.0000
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_gasseri|t__Lactobacillus_gasseri_unclassified 1.04474 0.0000
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_helveticus 4.29417 0.0000
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_helveticus|t__Lactobacillus_helveticus_unclassified 4.29417 0.0000
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_iners 0.00000 53.1189
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_iners|t__Lactobacillus_iners_unclassified 0.00000 53.1189

Gardnerella Mapping

Add relative abundance of individual clades and genomospecies in Stanford and UAB cohort samples, and check for differences in finding Gardnerella by MetaPhlAn2 and mapping method.

Stanford and UAB Clades

suGardCladeAb0 <- suGardCladePath %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID)) %>%
  select(-propClade) %>%
  filter(n>=2) %>% #count as present if >=2 reads are mapped to that clade/genomospecies
  group_by(SampleID) %>%
  mutate(propClade=n/sum(n)) %>%
  ungroup() %>%
  full_join(sumDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
  mutate(relativeAbundance=propClade*Gardnerella_vaginalis)

# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
suGardCladeAb0 %>%
  filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 1 × 6
##   SampleID   Clade     n propClade Gardnerella_vaginalis relativeAbundance
##   <chr>      <chr> <dbl>     <dbl>                 <dbl>             <dbl>
## 1 4008435358 C4        2         1                     0                 0
#check if there are any samples where Gard was found by metaphlan but not by mapping method
suGardCladeAb0 %>%
  filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 0 × 6
## # … with 6 variables: SampleID <chr>, Clade <chr>, n <dbl>, propClade <dbl>,
## #   Gardnerella_vaginalis <dbl>, relativeAbundance <dbl>

There is one sample where Gardnerella clades were found by mapping method and not by MetaPhlAn2, and no samples where Gardnerella was found by MetaPhlAn2 but not by mapping method

# make table of clade abundances
suGardCladeAb <- suGardCladeAb0 %>%
  filter(relativeAbundance>0) %>%
  select(SampleID, Clade, relativeAbundance)

Stanford and UAB Genomospecies

suGardGenomoAb0 <- suGardGenomoPath %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID)) %>%
  select(-propGenomospecies) %>%
  filter(n>=2) %>%
  group_by(SampleID) %>%
  mutate(propGenomospecies=n/sum(n)) %>%
  ungroup() %>%
  full_join(sumDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
  mutate(relativeAbundance=propGenomospecies*Gardnerella_vaginalis)

# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
suGardGenomoAb0 %>%
  filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 0 × 6
## # … with 6 variables: SampleID <chr>, Genomospecies <chr>, n <dbl>,
## #   propGenomospecies <dbl>, Gardnerella_vaginalis <dbl>,
## #   relativeAbundance <dbl>
#check if there are any samples where Gard was found by metaphlan but not by mapping method
suGardGenomoAb0 %>%
  filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 1 × 6
##   SampleID   Genomospecies     n propGenomospecies Gardnerella_vaginalis
##   <chr>      <chr>         <dbl>             <dbl>                 <dbl>
## 1 1061235278 <NA>             NA                NA                0.0152
## # … with 1 more variable: relativeAbundance <dbl>

No samples where Gardnerella genomospecies were found by mapping method but not MetaPhlAn2 and one sample where Gardnerella was found by MetaPhlAn2 but not by mapping method.

# make table of genomospecies abundances
suGardGenomoAb <- suGardGenomoAb0 %>%
  filter(relativeAbundance>0) %>%
  select(SampleID, Genomospecies, relativeAbundance)

MOMS-PI Clades

hmp2GardCladeAb0 <- hmp2GardCladePath %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID)) %>%
  select(-propClade) %>%
  filter(n>=2) %>% # 2+ reads need to map to a clade/genomospecies to be considered present
  group_by(SampleID) %>%
  mutate(propClade=n/sum(n)) %>%
  ungroup() %>%
  full_join(hmp2mDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
  mutate(relativeAbundance=propClade*Gardnerella_vaginalis) %>%
  filter(SampleID %in% QCfilterSamples)

# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
hmp2GardCladeAb0 %>%
  filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 9 × 6
##   SampleID Clade     n propClade Gardnerella_vaginalis relativeAbundance
##   <chr>    <chr> <dbl>     <dbl>                 <dbl>             <dbl>
## 1 6744255  C2        2         1                     0                 0
## 2 6744416  C4        2         1                     0                 0
## 3 6744438  C3        2         1                     0                 0
## 4 6744478  C4        6         1                     0                 0
## 5 6744556  C4        2         1                     0                 0
## 6 6744891  C3        2         1                     0                 0
## 7 6744963  C4        2         1                     0                 0
## 8 6745078  C4        2         1                     0                 0
## 9 6745186  C3        2         1                     0                 0
#check if there are any samples where Gard was found by metaphlan but not by mapping method
hmp2GardCladeAb0 %>%
  filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 58 × 6
##    SampleID Clade     n propClade Gardnerella_vaginalis relativeAbundance
##    <chr>    <chr> <dbl>     <dbl>                 <dbl>             <dbl>
##  1 6743917  <NA>     NA        NA              0.00304                 NA
##  2 6743919  <NA>     NA        NA              0.0261                  NA
##  3 6743922  <NA>     NA        NA              0.00221                 NA
##  4 6743987  <NA>     NA        NA              0.000390                NA
##  5 6743992  <NA>     NA        NA              0.0289                  NA
##  6 6744004  <NA>     NA        NA              0.000170                NA
##  7 6744017  <NA>     NA        NA              0.105                   NA
##  8 6744034  <NA>     NA        NA              1                       NA
##  9 6744039  <NA>     NA        NA              0.00244                 NA
## 10 6744040  <NA>     NA        NA              0.000451                NA
## # … with 48 more rows

There are 58 samples where Gardnerella was found by MetaPhlan2 but no clades found by mapping method, and 9 samples where Gardnerella was found by mapping method but not by MetaPhlan2

# make abundance table
hmp2GardCladeAb <- hmp2GardCladeAb0 %>%
  filter(relativeAbundance>0) %>%
  select(SampleID, Clade, relativeAbundance)

MOMS-PI Genomospecies

hmp2GardGenomoAb0 <- hmp2GardGenomoPath %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID)) %>%
  select(-propGenomospecies) %>%
  filter(n>=2) %>% # 2+ reads mapped to each clade/genomospecies to be considered present
  group_by(SampleID) %>%
  mutate(propGenomospecies=n/sum(n)) %>%
  ungroup() %>%
  full_join(hmp2mDF[,c("SampleID", "Gardnerella_vaginalis")], by=c("SampleID")) %>%
  mutate(relativeAbundance=propGenomospecies*Gardnerella_vaginalis)

# check if there are any samples where Gard was not found by metaphlan but was found by mapping method
hmp2GardGenomoAb0 %>%
  filter(Gardnerella_vaginalis==0, n>1)
## # A tibble: 2 × 6
##   SampleID Genomospecies     n propGenomospecies Gardnerella_vaginalis
##   <chr>    <chr>         <dbl>             <dbl>                 <dbl>
## 1 6744255  GS4               2                 1                     0
## 2 6744478  GS5               5                 1                     0
## # … with 1 more variable: relativeAbundance <dbl>
#check if there are any samples where Gard was found by metaphlan but not by mapping method
hmp2GardGenomoAb0 %>%
  filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 127 × 6
##    SampleID Genomospecies     n propGenomospecies Gardnerella_vaginalis
##    <chr>    <chr>         <dbl>             <dbl>                 <dbl>
##  1 6743917  <NA>             NA                NA              0.00304 
##  2 6743919  <NA>             NA                NA              0.0261  
##  3 6743922  <NA>             NA                NA              0.00221 
##  4 6743926  <NA>             NA                NA              0.0128  
##  5 6743955  <NA>             NA                NA              0.00157 
##  6 6743987  <NA>             NA                NA              0.000390
##  7 6743992  <NA>             NA                NA              0.0289  
##  8 6744004  <NA>             NA                NA              0.000170
##  9 6744017  <NA>             NA                NA              0.105   
## 10 6744034  <NA>             NA                NA              1       
## # … with 117 more rows, and 1 more variable: relativeAbundance <dbl>

127 samples where Gardnerella was found by MetaPhlan2 but no genomospecies were found by mapping method and 2 samples where Gardnerella was found by mapping method but not by MetaPhlan2

#make abundance table
hmp2GardGenomoAb <- hmp2GardGenomoAb0 %>%
  filter(relativeAbundance>0) %>%
  select(SampleID, Genomospecies, relativeAbundance)

Further assess uncharacterized samples:

Asses samples where Gardnerella was uncharacterized at the clade level. (Where Gardnerella was found by MetaPhlAn2 but no clades were identified using the mapping method).

unchGardSamples <- hmp2GardCladeAb0 %>%
  filter(is.na(n), Gardnerella_vaginalis>0) %>%
  .$SampleID

Comparison of number of microbial paired reads and proportion Gardnerella in samples with uncharacterized Gardnerella vs. samples where Gardnerella was found with both methods.

q <- hmp2mDF %>%
  select(SampleID, Gardnerella_vaginalis) %>%
  mutate(unchGard=case_when(SampleID %in% unchGardSamples~TRUE,
                            !(SampleID %in% unchGardSamples)~FALSE)) %>%
  left_join(sgDF1[,c("SampleID", "bbmapFiltered_pairs", "bacLoad")], by="SampleID") %>%
  ggplot(aes(x=Gardnerella_vaginalis, y=bbmapFiltered_pairs, color=unchGard, shape=unchGard)) +
  geom_point(alpha=0.5) +
  theme(legend.position = "bottom") +
  binaryColors +
  labs(x="Proportion Gardnerella", y="Microbial Reads", color="Uncharacterized Gardnerella", shape="Uncharacterized Gardnerella")

ggMarginal(q, groupFill = TRUE, type="density")

Samples with uncharacterized Gardnerella have fewer microbial reads and a lower proportion of Gardnerella

Join abundance tables

# Gardnerella abundance
gardCladeAb0 <- full_join(suGardCladeAb, hmp2GardCladeAb, by = c("SampleID", "Clade", "relativeAbundance"))
gardGenomoAb0 <- full_join(suGardGenomoAb, hmp2GardGenomoAb, by = c("SampleID", "Genomospecies", "relativeAbundance"))

# metaphlan
mDF0 <- sumDF %>%
  full_join(hmp2mDF)

#number of speacies that appear in only one vs. both samples
testOverlap <- summarise_all(mDF0, anyNA) %>%
  gather("Species", "anyNA", 2:ncol(.)) %>%
  select(-SampleID)

nrow(testOverlap)
## [1] 364
table(testOverlap$anyNA)
## 
## FALSE  TRUE 
##   209   155

364 total species and 209 appear in both studies (Stanford/UAB vs. MOMS-PI). Number of species in each study:

ncol(sumDF)-1
## [1] 225
ncol(hmp2mDF)-1
## [1] 348

Replace NAs with zeros for abundance values for species that did not appear in each respective study.

mDF1 <- mDF0 %>%
  mutate_if(is.numeric, ~replace_na(.x, 0)) %>%
  left_join(sgDF1[,c("SampleID", "Cohort")], by="SampleID") %>%
  select(SampleID, Cohort, everything())
anyNA(mDF1) # check for remaining NAs
## [1] FALSE

Select enriched Gardnerella MOMS-PI subset

Proportion *Gardnerella* in samples by study:
mDF1 %>%
  select(SampleID, Cohort, Gardnerella_vaginalis) %>%
  mutate(Cohort=case_when(Cohort=="Stanford Enriched"|Cohort=="UAB Enriched"~"Stanford & UAB Enriched",
                          Cohort=="MOMS-PI"~"MOMS-PI"),
         Cohort=factor(Cohort, levels = c("Stanford & UAB Enriched", "MOMS-PI"))) %>%
  ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort, group=Cohort)) +
  geom_density(alpha=0.5) +
  binaryColors +
  labs(x="Proportion Gardnerella")

gardAbundance2Cohort <- mDF1 %>%
  select(SampleID, Cohort, Gardnerella_vaginalis) %>%
  left_join(sgDF1[,c("SampleID", "SubjectID")], by="SampleID") %>%
  mutate(Cohort=case_when(Cohort=="Stanford Enriched"|Cohort=="UAB Enriched"~"Stanford & UAB Enriched",
                          Cohort=="MOMS-PI"~"MOMS-PI"),
         Cohort=factor(Cohort, levels = c("Stanford & UAB Enriched", "MOMS-PI")))
gardAbundance2Cohort %>%
  group_by(Cohort) %>%
  summarize(min=min(Gardnerella_vaginalis),
            `1stQu`=summary(Gardnerella_vaginalis)[2],
            median=median(Gardnerella_vaginalis),
            mean=mean(Gardnerella_vaginalis),
            `3rdQu`=summary(Gardnerella_vaginalis)[5],
            max=max(Gardnerella_vaginalis)) %>%
  formattable()
Cohort min 1stQu median mean 3rdQu max
Stanford & UAB Enriched 0 0.1712211 0.5132085 0.4505522 0.6993391 0.9814731
MOMS-PI 0 0.0036418 0.0566380 0.2401044 0.4823945 1.0000000

Proportion Gardnerella by as subject averages by study:

subjectGardAbundance2Cohort <- gardAbundance2Cohort %>%
  group_by(SubjectID, Cohort) %>%
  summarize(Gardnerella_vaginalis=mean(Gardnerella_vaginalis)) %>%
  ungroup
  
subjectAvgPropGard <- subjectGardAbundance2Cohort %>%
  ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort)) +
  geom_density(alpha=0.5) +
  binaryColors +
  theme(legend.position = "bottom", 
        legend.text = element_text(size=10), 
        legend.title = element_text(size=11)) +
  labs(x="Subject Average Proportion Gardnerella")
subjectAvgPropGard

MGS sequenced samples from Stanford and UAB were selected based on enrichment of Gardnerella at the subject level. Create a cohort of a subset of samples in the MOMS-PI cohort that are also selected for enrichment of Gardnerella at the subject level for better comparison.

ntiles <- subjectGardAbundance2Cohort %>%
  filter(Cohort=="Stanford & UAB Enriched") %>%
  mutate(decile=ntile(Gardnerella_vaginalis, 3)) %>%
  group_by(decile) %>%
  filter(Gardnerella_vaginalis==min(Gardnerella_vaginalis)) %>%
  .$Gardnerella_vaginalis %>%
  sort %>%
  unique
ntiles
## [1] 0.0000000 0.3569745 0.6732104
# subjectGardAbundance2Cohort0 <- subjectGardAbundance2Cohort %>%
#   filter(Cohort=="MOMS-PI",
#          SubjectID %in% hmp2SubjectsWDel) %>% # *** For selecting subjects that have gestational age at delivery  ***
#   mutate(Ntile=case_when(between(Gardnerella_vaginalis, ntiles[1], ntiles[2])~1,
#                         between(Gardnerella_vaginalis, ntiles[2], ntiles[3])~2,
#                         Gardnerella_vaginalis > ntiles[3]~3))
# 
# subjN <- min(count(subjectGardAbundance2Cohort0, Ntile)$n) # number of subjects from each ntile to take
# 
# HMP2highGardSubjects <- subjectGardAbundance2Cohort0 %>% 
#   group_by(Ntile) %>%
#   sample_n(subjN, replace = FALSE) %>%
#   .$SubjectID
#  
# write_lines(HMP2highGardSubjects, "./HMP2highGardSubjects.txt")
HMP2highGardSubjects <- read_lines("./HMP2highGardSubjects.txt")

Figure S4: Distributions of subject average Gardnerella abundances

#fig.width=5, fig.height=2.5
subjectAvgPropGardEnr <- subjectGardAbundance2Cohort %>%
  filter(Cohort=="Stanford & UAB Enriched"|(Cohort=="MOMS-PI"&SubjectID %in% HMP2highGardSubjects)) %>%
  mutate(Cohort=factor(Cohort, levels=c("Stanford & UAB Enriched", "MOMS-PI"), labels=c("Stanford & UAB Enriched", "MOMS-PI Enriched"))) %>%
  ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort, group=Cohort)) +
  geom_density(alpha=0.5) +
  binaryColors +
  theme(legend.position = "bottom", 
        legend.text = element_text(size=10), 
        legend.title = element_text(size=11)) +
  labs(x="Subject Average Proportion Gardnerella")

plot_grid(subjectAvgPropGard, subjectAvgPropGardEnr, nrow=1, labels = c("A", "B"), label_size = 15)

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS4_enrichedSubjectPropGardbyStudy.png", sep="_")))
subjectGardAbundance2Cohort %>%
  group_by(Cohort) %>%
  summarize(min=min(Gardnerella_vaginalis),
            `1stQu`=summary(Gardnerella_vaginalis)[2],
            median=median(Gardnerella_vaginalis),
            mean=mean(Gardnerella_vaginalis),
            `3rdQu`=summary(Gardnerella_vaginalis)[5],
            max=max(Gardnerella_vaginalis)) %>%
  formattable()
Cohort min 1stQu median mean 3rdQu max
Stanford & UAB Enriched 0 0.25958767 0.4686271 0.4515941 0.6906473 0.9041753
MOMS-PI 0 0.01845903 0.1567142 0.2396632 0.3973811 0.9826293
subjectGardAbundance2Cohort %>%
  filter((Cohort=="MOMS-PI"&SubjectID %in% HMP2highGardSubjects)) %>%
  mutate(Cohort=factor(Cohort, levels=c("Stanford & UAB Enriched", "MOMS-PI"), labels=c("Stanford & UAB Enriched", "MOMS-PI Enriched"))) %>%
  group_by(Cohort) %>%
  summarize(min=min(Gardnerella_vaginalis),
            `1stQu`=summary(Gardnerella_vaginalis)[2],
            median=median(Gardnerella_vaginalis),
            mean=mean(Gardnerella_vaginalis),
            `3rdQu`=summary(Gardnerella_vaginalis)[5],
            max=max(Gardnerella_vaginalis)) %>%
  formattable()
Cohort min 1stQu median mean 3rdQu max
MOMS-PI Enriched 0.00218635 0.2808695 0.4526737 0.4684783 0.722244 0.8500606

Distribution of Gardnerella abundnace in samples

HMP2HighGardSamples <- sgDF1 %>%
  filter(SubjectID %in% HMP2highGardSubjects) %>%
  .$SampleID

gardAbundance2Cohort %>%
  filter(Cohort=="Stanford & UAB Enriched"|Cohort=="MOMS-PI"&SubjectID %in% HMP2highGardSubjects) %>%
  mutate(Cohort=factor(Cohort, levels=c("Stanford & UAB Enriched", "MOMS-PI"), labels=c("Stanford & UAB Enriched", "MOMS-PI Enriched"))) %>%
  ggplot(aes(x=Gardnerella_vaginalis, color=Cohort, fill=Cohort)) +
  geom_density(alpha=0.5) +
  binaryColors +
  labs(x="Proportion of Gardnerella")

Add enriched MOMS-PI cohort to tables Sample and subject metadata:

# extract enriched Gardnerella subset and full join to metadata 
highGardSgDF <- sgDF1 %>%
  filter(SubjectID %in% HMP2highGardSubjects) %>% 
  mutate(Cohort="MOMS-PI Enriched",
         SampleID=paste0(SampleID, "_E"),
         SubjectID=paste0(SubjectID, "_E"))

nrow(highGardSgDF)  
## [1] 145
sgDF <- sgDF1 %>%
  mutate(Cohort=as.vector(Cohort),
         SubjectID=as.character(SubjectID)) %>%
  full_join(highGardSgDF, by=colnames(sgDF1)) %>%
  mutate(Cohort=factor(Cohort, levels = cohorts))

bacLoad <- sgDF %>%
  select(SampleID, bacLoad)

sgDF %>%
  group_by(Cohort) %>%
  summarize(`N Subjects`=n_distinct(SubjectID), `N Samples`=n_distinct(SampleID)) %>%
  formattable()
Cohort N Subjects N Samples
Stanford Enriched 20 62
UAB Enriched 15 45
MOMS-PI Enriched 42 145
MOMS-PI 231 781

MetaPhlAn2 species abundance tables:

highGardmDF <- mDF1 %>%
  filter(SampleID %in% HMP2HighGardSamples) %>%
  mutate(Cohort="MOMS-PI Enriched",
         SampleID=paste0(SampleID, "_E"))

mDF <- mDF1 %>%
  mutate(Cohort=as.vector(Cohort)) %>%
  rbind(highGardmDF) %>%
  mutate(Cohort=factor(Cohort, levels = c("Stanford Enriched", "UAB Enriched", "MOMS-PI Enriched", "MOMS-PI")))

Gardnerella clade and genomospecies abundance tables:

# Clade Abundances
gardCladeAb1 <- gardCladeAb0 %>%
  left_join(sgDF1[,c("SampleID", "Cohort")], by="SampleID") %>%
  mutate(Cohort=as.vector(Cohort))
 
highGardCladeAb <- gardCladeAb1 %>%
  filter(SampleID %in% HMP2HighGardSamples) %>%
  mutate(Cohort="MOMS-PI Enriched",
         SampleID=paste0(SampleID, "_E"))

gardCladeAb <- gardCladeAb1 %>%
  rbind(highGardCladeAb) %>%
  mutate(Cohort=factor(Cohort, levels = cohorts))

# Gemomospecies Abundances
gardGenomoAb1 <- gardGenomoAb0 %>%
  left_join(sgDF1[,c("SampleID", "Cohort")], by="SampleID") %>%
  mutate(Cohort=as.vector(Cohort))
 
highGardGenomoAb <- gardGenomoAb1 %>%
  filter(SampleID %in% HMP2HighGardSamples) %>%
  mutate(Cohort="MOMS-PI Enriched", 
         SampleID=paste0(SampleID, "_E"))

gardGenomoAb <- gardGenomoAb1 %>%
  rbind(highGardGenomoAb) %>%
  mutate(Cohort=factor(Cohort, levels = cohorts))

# note these tables contain samples with at least one identified clade or genomospecies of gardnerella

Top taxa

avgTop20Abundance0 <- mDF %>%
  select(SampleID, Cohort, everything()) %>%
  gather("Species", "Abundance", 3:ncol(.)) %>%
  group_by(Cohort, Species) %>%
  summarise(avgAbundance=mean(Abundance)) 

top20taxa <- avgTop20Abundance0 %>%
  group_by(Cohort) %>%
  top_n(20, avgAbundance) %>%
  .$Species %>%
  unique

avgTop20Abundance0 %>%
  spread(Cohort, avgAbundance) %>%
  mutate(stanfordRank=rank(-`Stanford Enriched`),
         uabRank=rank(-`UAB Enriched`), 
         momspiRank=rank(-`MOMS-PI`),
         momspiERank=rank(-`MOMS-PI Enriched`),
         Stanford=round(`Stanford Enriched`, 2),
         UAB=round(`UAB Enriched`,2),
         `MOMS-PI`=round(`MOMS-PI`, 2),
         `MOMS-PI Enriched`=round(`MOMS-PI Enriched`, 2)) %>%
  filter(Species %in% top20taxa) %>%
  select(Species, `Stanford Enriched`, stanfordRank, `UAB Enriched`, uabRank, `MOMS-PI Enriched`, momspiERank, `MOMS-PI`, momspiRank) %>%
  arrange(stanfordRank) %>%
  formattable()
Species Stanford Enriched stanfordRank UAB Enriched uabRank MOMS-PI Enriched momspiERank MOMS-PI momspiRank
Gardnerella_vaginalis 3.701475e-01 1 5.613321e-01 1.0 0.46 1 0.24 2
Lactobacillus_iners 1.657524e-01 2 1.991789e-01 2.0 0.24 2 0.33 1
Lactobacillus_crispatus 1.387812e-01 3 3.084282e-02 4.0 0.06 3 0.17 3
Lactobacillus_gasseri 9.288219e-02 4 1.278889e-05 111.0 0.01 10 0.04 5
Atopobium_vaginae 5.537944e-02 5 7.964385e-02 3.0 0.06 4 0.03 6
Lactobacillus_jensenii 5.374158e-02 6 5.861467e-04 34.0 0.03 5 0.05 4
Prevotella_timonensis 1.395939e-02 7 4.420527e-03 12.0 0.02 9 0.01 7
Ureaplasma_parvum 8.847260e-03 8 1.727678e-03 20.0 0.00 15 0.01 13
Prevotella_bivia 7.590635e-03 9 1.827222e-02 5.0 0.01 12 0.01 14
Finegoldia_magna 7.540677e-03 10 1.186363e-02 7.0 0.00 33 0.00 19
Megasphaera_genomosp_type_1 7.273266e-03 11 8.389764e-03 9.0 0.02 7 0.01 9
Prevotella_amnii 7.232839e-03 12 1.149809e-02 8.0 0.02 8 0.01 12
Mycoplasma_hominis 6.399960e-03 13 1.373242e-02 6.0 0.01 11 0.01 10
Megasphaera_unclassified 6.204035e-03 14 5.562682e-03 11.0 0.01 13 0.00 17
Ureaplasma_urealyticum 5.318142e-03 15 1.671022e-03 21.0 0.00 23 0.00 27
Varibaculum_cambriense 3.935813e-03 16 6.948800e-04 33.0 0.00 67 0.00 71
Dialister_micraerophilus 3.856794e-03 17 2.908351e-03 14.0 0.01 14 0.00 15
Veillonella_unclassified 3.408242e-03 18 7.272222e-05 78.0 0.00 120 0.00 72
Peptoniphilus_harei 3.114544e-03 19 7.010158e-03 10.0 0.00 37 0.00 23
Oligella_urethralis 2.957774e-03 20 3.871511e-04 45.0 0.00 113 0.00 41
Prevotella_disiens 2.237563e-03 21 3.915562e-03 13.0 0.00 39 0.00 20
Streptococcus_anginosus 1.453100e-03 26 2.524624e-03 15.0 0.00 31 0.00 32
Bifidobacterium_breve 1.129426e-03 27 1.512753e-03 22.0 0.00 18 0.00 30
Peptoniphilus_lacrimalis 1.115758e-03 29 2.299044e-03 18.0 0.00 42 0.00 49
Corynebacterium_jeikeium 5.682258e-04 39 1.828533e-03 19.0 0.00 115 0.00 171
Streptococcus_mitis_oralis_pneumoniae 5.078548e-04 42 1.508578e-04 72.0 0.00 19 0.00 24
Lactobacillus_phage_Lv_1 4.896242e-04 44 0.000000e+00 255.5 0.00 16 0.01 8
Corynebacterium_sp_HFH0082 3.812645e-04 50 2.431147e-03 16.0 0.00 256 0.00 39
Clostridiales_genomosp_BVAB3 3.396290e-05 103 8.818867e-04 25.0 0.00 20 0.00 43
Bacteroides_fragilis 8.709677e-07 167 0.000000e+00 255.5 0.00 17 0.00 59
Lactobacillus_delbrueckii 4.419355e-07 173 0.000000e+00 255.5 0.00 256 0.00 16
Prevotella_melaninogenica 6.935484e-08 184 2.406838e-03 17.0 0.00 110 0.00 69
Bradyrhizobium_sp_DFCI_1 0.000000e+00 275 0.000000e+00 255.5 0.02 6 0.01 11
Lactobacillus_helveticus 0.000000e+00 275 0.000000e+00 255.5 0.00 256 0.00 18
x <- mDF %>%
  select(SampleID, Cohort, top20taxa) %>%
  mutate_if(is.numeric, ~case_when(.x>0~1,
                                   .x==0~0))%>%
  group_by(Cohort) %>%
  summarize_if(is.numeric, ~sum(.x)/n()) %>%
  data.frame()
rownames(x) <- x$Cohort
x %>%
  select(-Cohort) %>%
  t %>%
  as.data.frame() %>%
  rownames_to_column("Species") %>%
  arrange(-`Stanford Enriched`) %>%
  formattable()
Species Stanford Enriched UAB Enriched MOMS-PI Enriched MOMS-PI
Gardnerella_vaginalis 0.85483871 0.95555556 0.958620690 0.852752881
Atopobium_vaginae 0.62903226 0.84444444 0.731034483 0.494238156
Finegoldia_magna 0.58064516 0.60000000 0.131034483 0.186939821
Dialister_micraerophilus 0.54838710 0.80000000 0.655172414 0.400768246
Prevotella_timonensis 0.53225806 0.71111111 0.468965517 0.368758003
Lactobacillus_iners 0.45161290 0.91111111 0.924137931 0.834827145
Prevotella_bivia 0.41935484 0.55555556 0.289655172 0.239436620
Lactobacillus_gasseri 0.38709677 0.08888889 0.082758621 0.161331626
Peptoniphilus_harei 0.38709677 0.66666667 0.172413793 0.167733675
Ureaplasma_parvum 0.35483871 0.53333333 0.455172414 0.430217670
Megasphaera_genomosp_type_1 0.33870968 0.80000000 0.620689655 0.395646607
Prevotella_disiens 0.33870968 0.60000000 0.117241379 0.157490397
Lactobacillus_crispatus 0.27419355 0.08888889 0.296551724 0.362355954
Megasphaera_unclassified 0.27419355 0.57777778 0.393103448 0.218950064
Prevotella_amnii 0.25806452 0.46666667 0.462068966 0.271446863
Peptoniphilus_lacrimalis 0.25806452 0.46666667 0.137931034 0.134443022
Streptococcus_anginosus 0.25806452 0.40000000 0.089655172 0.084507042
Lactobacillus_jensenii 0.24193548 0.15555556 0.193103448 0.307298335
Ureaplasma_urealyticum 0.20967742 0.28888889 0.089655172 0.056338028
Bifidobacterium_breve 0.19354839 0.15555556 0.068965517 0.042253521
Varibaculum_cambriense 0.17741935 0.28888889 0.027586207 0.070422535
Mycoplasma_hominis 0.16129032 0.68888889 0.344827586 0.243277849
Corynebacterium_jeikeium 0.12903226 0.26666667 0.020689655 0.026888604
Oligella_urethralis 0.06451613 0.06666667 0.013793103 0.021766965
Streptococcus_mitis_oralis_pneumoniae 0.06451613 0.04444444 0.041379310 0.040973111
Veillonella_unclassified 0.04838710 0.08888889 0.013793103 0.024327785
Clostridiales_genomosp_BVAB3 0.04838710 0.35555556 0.206896552 0.107554417
Corynebacterium_sp_HFH0082 0.03225806 0.08888889 0.000000000 0.003841229
Bacteroides_fragilis 0.03225806 0.00000000 0.006896552 0.003841229
Prevotella_melaninogenica 0.01612903 0.17777778 0.013793103 0.025608195
Lactobacillus_phage_Lv_1 0.01612903 0.00000000 0.020689655 0.026888604
Lactobacillus_delbrueckii 0.01612903 0.00000000 0.000000000 0.010243278
Bradyrhizobium_sp_DFCI_1 0.00000000 0.00000000 0.103448276 0.062740077
Lactobacillus_helveticus 0.00000000 0.00000000 0.000000000 0.011523688

Create abundance table with taxa of interest

# abundance table of taxa of interest with clades
mDFtoiClades <- gardCladeAb %>%
  spread(Clade, relativeAbundance) %>%
  full_join(mDF, by=c("SampleID", "Cohort")) %>%
  select(SampleID, Cohort, Gardnerella_vaginalis, clades, lactos, anaerobes) %>% # keep total Gardnerella abundance in table for now. Many operations are taxon vs. taxon. Will need to remove this column when analyses call for it.
  mutate_at(vars(c(Gardnerella_vaginalis, clades, lactos, anaerobes)), replace_na, 0) %>%
  mutate(Cohort=factor(Cohort, levels=cohorts))

Gardnerella variant abundance

Clades per sample

gardCladeAbU <- gardCladeAb %>%
  full_join(mDF[,c("SampleID", "Cohort", "Gardnerella_vaginalis")], by=c("SampleID","Cohort")) %>%
  spread(Clade, relativeAbundance) %>%
  select(-`<NA>`) %>% # remove "NA column" created by samples that have zero Gardnerella
  mutate(Uncharacterized_Gardnerella=case_when(Gardnerella_vaginalis>=0&is.na(C1)&is.na(C2)&is.na(C3)&is.na(C4)&is.na(C5)&is.na(C6)~Gardnerella_vaginalis,
                                               Gardnerella_vaginalis>0&!is.na(clades)~0)) %>%
  gather("var", "relativeAbundance", c(clades, Gardnerella_vaginalis, Uncharacterized_Gardnerella)) %>%
  replace_na(list(relativeAbundance=0))

Average clades per sample in each cohorts:

gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  with_groups(Cohort, summarize, mean=mean(nClades), `standard deviation`=sd(nClades)) %>%
  formattable
Cohort mean standard deviation
Stanford Enriched 3.016129 1.894703
UAB Enriched 4.777778 1.593864
MOMS-PI Enriched 3.772414 1.892033
MOMS-PI 2.655570 2.088537

Average clades per sample across all cohorts:

gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  summarize(mean=mean(nClades), `standard deviation`=sd(nClades)) %>%
  formattable
mean standard deviation
2.926428 2.103064
cladesPerSample <- gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  count(Cohort, nClades) %>%
  group_by(Cohort) %>%
  mutate(n=n/sum(n)) %>%
  ggplot(aes(x=nClades, y=n)) +
  geom_bar(stat="identity", fill = "black") +
  facet_wrap(~Cohort) +
  labs(x="Clades per Sample", y="Proportion of Samples", fill=bquote('Uncharacterized'~italic(Gardnerella))) +
  theme(axis.text.x = element_text(size=9),
        strip.text = element_text(size=13),
        aspect.ratio = 0.5)

Genomospecies per sample

gardGenomoAbU <- gardGenomoAb %>%
  spread(Genomospecies, relativeAbundance) %>%
  full_join(mDF[,c("SampleID", "Cohort", "Gardnerella_vaginalis")], by=c("SampleID", "Cohort")) %>%
  mutate(Uncharacterized_Gardnerella=case_when(Gardnerella_vaginalis>=0&is.na(GS1)&is.na(GS2)&is.na(GS3)&is.na(GS4)&is.na(GS5)&is.na(GS6)&is.na(GS7)&is.na(GS8)&is.na(GS9)&is.na(GS10)&is.na(GS11)&is.na(GS12)&is.na(GS13)&is.na(GS14)~Gardnerella_vaginalis,
                                               Gardnerella_vaginalis>0&!is.na(genomos)~0)) %>%
  gather("var", "relativeAbundance", c(genomos, Gardnerella_vaginalis, Uncharacterized_Gardnerella)) %>%
  replace_na(list(relativeAbundance=0))

Average clades per sample in each cohorts:

 gardGenomoAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(genomos, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
  with_groups(Cohort, summarize, mean=mean(nGenomos), `standard deviation`=sd(nGenomos)) %>%
  formattable
Cohort mean standard deviation
Stanford Enriched 4.016129 3.226387
UAB Enriched 9.177778 4.339506
MOMS-PI Enriched 5.379310 3.811658
MOMS-PI 3.382843 3.750646

Number of samples with all 14 genomospecies

 gardGenomoAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(genomos, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
  count(Cohort, nGenomos) %>%
  group_by(Cohort) %>%
  mutate(pcnt=n/sum(n)*100) %>% # convert to fraction of sample
  filter(nGenomos==14) %>%
  formattable
Cohort nGenomos n pcnt
Stanford Enriched 14 1 1.6129032
UAB Enriched 14 12 26.6666667
MOMS-PI Enriched 14 1 0.6896552
MOMS-PI 14 6 0.7682458

Average clades per sample across all cohorts:

 gardGenomoAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(genomos, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
  summarize(mean=mean(nGenomos), `standard deviation`=sd(nGenomos)) %>%
  formattable
mean standard deviation
3.953533 3.974942
genomosPerSample <- gardGenomoAbU %>%
  spread(var, relativeAbundance) %>%
  mutate_at(genomos, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nGenomos=GS1+GS2+GS3+GS4+GS5+GS6+GS7+GS8+GS9+GS10+GS11+GS12+GS13+GS14) %>%
  count(Cohort, nGenomos) %>%
  group_by(Cohort) %>%
  mutate(n=n/sum(n), # convert to fraction of sample
         nGenomos=factor(nGenomos, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"))) %>% 
  ggplot(aes(x=nGenomos, y=n)) +
  geom_bar(stat="identity", fill = "black") +
  facet_wrap(~Cohort) +
  labs(x="Genomospecies per Sample", y="Proportion of Samples") +
  theme(axis.text.x = element_text(size=9),
        strip.text = element_text(size=13),
        aspect.ratio = 0.5)

Variant Relative abundance

Clades

Calculate clade prevalences

prevTable <- gardCladeAbU %>%
  with_groups(c(Cohort, var), summarize, p = sum(relativeAbundance > 0), n = n(), m=mean(relativeAbundance), sd=sd(relativeAbundance)) %>% 
  mutate(prevalence=round(p/n, 2),
         var=factor(var, levels = c(clades, "Uncharacterized_Gardnerella",  "Gardnerella_vaginalis"), labels=c(clades, "Unch.", "Total")))

Sample relative abundances with prevalence

#fig.width=7, fig.height=3.5
gardCladeAbU %>%
  filter(var!="Uncharacterized_Gardnerella") %>%
  mutate(var=factor(var, levels = c(clades,  "Gardnerella_vaginalis"), labels=c(clades, "Total"))) %>%
  left_join(prevTable[,c("Cohort", "var", "prevalence")], by=c("Cohort", "var")) %>%
  mutate(Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs)) %>%
  ggplot(aes(x=var, y=relativeAbundance, color=var, fill=var)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(aes(y=1.2, x=var, label=prevalence), color="black", size=5) +
  scale_y_continuous(breaks = c(0, 0.25, 0.50, 0.75, 1, 1.2), limits = c(0,1.3), labels=c("0.00", "0.25", "0.50", "0.75", "1.00", "prevalence")) +
  cladeColorsTotal +
  facet_grid(Cohort~.) +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 16),
        axis.text.y = element_text(size=16),
        axis.title.y = element_text(size=23),
        strip.text = element_text(size=17)) +
  labs(x=NULL, y="Relative Abundance")

Clades differ in abundance and abundance of clades varies across cohorts. Clade 3 is less abundant in Stanford cohort but abundant in other cohorts, especially UAB. Clade 1 is more abundant in Stanford. Clade 6 is rare in Stanford but appears in UAB and MOMS-PI cohorts.

Prevalence vs relative abundance of Gardnerella clades

cladeAbvPrev <- prevTable %>%
  filter(var!="Unch.") %>%
  ggplot(aes(x=prevalence, y=m, color=var)) +
  geom_point(size=3, alpha=0.75) +
  scale_x_continuous(limits=c(0,1), breaks=seq(0,1,0.2)) +
  scale_y_continuous(limits =c(0,1), breaks=seq(0,1,0.2)) +
  facet_wrap(~Cohort) +
  cladeColorsTotal +
  coord_fixed() +
  theme(legend.position = "right",
        strip.text = element_text(size=13)) +
  labs(x="Prevalence", y="Mean Relative Abundance", color="Clade")

Genomospecies

Calculate genomospecies prevalences

genomosPrevTable <- gardGenomoAbU %>%
  with_groups(c(Cohort, var), summarize, p = sum(relativeAbundance > 0), n = n(), m=mean(relativeAbundance), sd=sd(relativeAbundance)) %>% 
  mutate(prevalence=round(p/n, 2))

Sample relative abundance with prevalence

#fig.height=3, fig.width=7
gardGenomoAbU %>%
  left_join(genomosPrevTable[,c("Cohort", "var", "prevalence")], by=c("Cohort", "var")) %>%
  filter(var!="Uncharacterized_Gardnerella") %>%
  mutate(var=factor(var, levels=c(genomosCladeOrder, "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total")),
         Cohort=factor(Cohort, levels=cohorts, labels = cohortAbbrevs)) %>%
  ggplot(aes(x=var, y=relativeAbundance, color=var, fill=var)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(aes(y=1.2, x=var, label=prevalence), color="black", size=5) +
  scale_y_continuous(breaks = c(0, 0.25, 0.50, 0.75, 1, 1.2), limits = c(0,1.3), labels=c("0.00", "0.25", "0.50", "0.75", "1.00", "prevalence")) +
  facet_grid(Cohort~.) +
  genomoColorsTotal +
  theme(axis.text.x = element_text(size=11),
        axis.text.y = element_text(size=12),
        axis.title.y = element_text(size=20),
        strip.text = element_text(size = 11),
        legend.position="none") +
  labs(x=NULL, y="Relative Abundance")

Prevalence vs relative abundance of Gardnerella genomospecies

genomoAbvPrev <- gardGenomoAbU %>%
  group_by(Cohort, var) %>%
  summarise(avgAbundance=mean(relativeAbundance)) %>%
  left_join(genomosPrevTable, by=c("Cohort", "var")) %>%
  filter(var!="Uncharacterized_Gardnerella") %>%
  dplyr::rename(Genomospecies=var) %>%
  mutate(Genomospecies=factor(Genomospecies, levels = c(genomosCladeOrder, "Gardnerella_vaginalis"))) %>%
  ggplot(aes(x=prevalence, y=avgAbundance, color=Genomospecies)) +
  geom_point(size=3, alpha=0.75) +
  scale_x_continuous(limits=c(0,1), breaks=seq(0,1,0.2)) +
  scale_y_continuous(limits =c(0,1), breaks=seq(0,1,0.2)) +
  genomoColorsTotal +
  facet_wrap(~Cohort) +
  coord_fixed() +
  theme(legend.position = "right", 
        strip.text = element_text(size=13)) +
  labs(x="Prevalence", y="Mean Relative Abundance", color="Species")

Figure 3: Prevalence and abundance of Gardnerella variants across the four cohorts

#fig.width = 6, fig.height=4
varsPerSample <- plot_grid(cladesPerSample, genomosPerSample, nrow = 1, labels = c("A", "B"), label_size = 15)
gardVarAbun <- plot_grid(cladeAbvPrev, genomoAbvPrev, nrow = 1, labels = c("C", "D"), label_size = 15)
plot_grid(varsPerSample, gardVarAbun, nrow = 2, axis = "l")

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "Figure3_cladeGenomoPrevAbun.png", sep="_")))

Microbial Load

Compare microbial load across the four cohorts:

# table of descriptive statistics with outliers
sgDF %>%
  group_by(Cohort) %>%
  summarise_at("bacLoad", list(min=min, mean=mean, median=median, max=max)) %>%
  mutate_if(is.numeric, ~round(.x, 4)) %>%
  formattable()
Cohort min mean median max
Stanford Enriched 0.0035 2.5594 0.0504 150.0600
UAB Enriched 0.0036 0.4658 0.2504 6.0354
MOMS-PI Enriched 0.0111 0.2492 0.1259 4.7932
MOMS-PI 0.0038 0.2074 0.0446 47.6365
# microbial load of outliers
sgDF %>%
  filter(bacLoad>2) %>%
  select(SampleID, Cohort, bacLoad) %>%
  formattable
SampleID Cohort bacLoad
4009835178 UAB Enriched 6.035434
1005601348 Stanford Enriched 150.059967
6744677 MOMS-PI 4.103960
6743991 MOMS-PI 12.019563
6744354 MOMS-PI 47.636472
6744871 MOMS-PI 4.793229
6744459 MOMS-PI 4.739440
6744677_E MOMS-PI Enriched 4.103960
6744871_E MOMS-PI Enriched 4.793229
# density plot without outliers (bacload ≥ 2)
sgDF %>%
  filter(bacLoad<2) %>%
  mutate(Cohort=factor(Cohort, levels=cohorts, labels=cohortAbbrevs)) %>%
  ggplot(aes(x=bacLoad, fill=Cohort, color=Cohort)) +
  geom_density(alpha=0.5) +
  quadColors2 +
  theme(legend.position = c(0.85,0.8)) +
  labs(x="Microbial Load")

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS5_bacLoadDensNoOut.png", sep="_")))


# table of descriptive statistics without outliers
sgDF %>%
  filter(bacLoad<2) %>%
  group_by(Cohort) %>%
  summarise_at("bacLoad", list(min=min, mean=mean, median=median, max=max)) %>%
  mutate_if(is.numeric, ~round(.x, 4)) %>%
  formattable()
Cohort min mean median max
Stanford Enriched 0.0035 0.1414 0.0498 1.0077
UAB Enriched 0.0036 0.3392 0.2417 1.2209
MOMS-PI Enriched 0.0111 0.1905 0.1201 1.4112
MOMS-PI 0.0038 0.1143 0.0441 1.8656

Absolute abundance

Clade aboslute abundance

All samples absolute abundance

#fig.width=7, fig.height=3.5
gardCladeAbU %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad, 
         var=factor(var, levels = c(clades, "Uncharacterized_Gardnerella", "Gardnerella_vaginalis"), labels = c(clades, "Unch.", "Total")),
         Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs)) %>%
  ggplot(aes(x=var, y=absoluteAbundance, color=var)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  cladeColorsTotalUnch +
  scale_y_continuous(limits=c(0,1), n.breaks = 5) +
  facet_grid(Cohort~.) +
  theme(legend.position = "none",
        axis.text.x = element_text(size=16),
        axis.text.y = element_text(size=16),
        axis.title.y = element_text(size=23),
        strip.text = element_text(size=17)) +
  labs(x=NULL, y="Absolute Abundance")

Clades 3 and 4 appear to show the greatest absolute abundance.

Genomospecies aboslute abundance

All samples absolute abundance

#fig.height=3, fig.width=7
gardGenomoAbU %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad, 
         var=factor(var, levels=c(genomosCladeOrder, "Uncharacterized_Gardnerella", "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Unch.", "Total")),
         Cohort=factor(Cohort, levels = cohorts, labels = cohortAbbrevs)) %>%
  ggplot(aes(x=var, y=absoluteAbundance, color=var)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  facet_grid(Cohort~.) +  
  genomoColorsTotalUnch +
  theme(axis.text.x = element_text(size=11),
        axis.text.y = element_text(size=12),
        axis.title.y = element_text(size=20),
        strip.text = element_text(size = 11),
        legend.position="none") +
  labs(x=NULL, y="Absolute Abundance")

Genomospecies G. swidsinskii and genomospecies 7 appear to show the greatest absolute abundance.

Thresholds for presence-absence

Determine what threshold will be used to count a taxon as present vs. absent for co-occurrence and presence absence vs. microbial load analyses

MetaPhlAn2 will report all taxa that have 10% of marker genes non-zero based on this Google group response from Nicola Segata. Average number of marker genes per bacterial species is 184 ± 45. (Truong et al., 2015, Nature Mathods “MetaPhlAn2 for enhanced metagenomic taxonomic profiling”). About 1 million markers from >7,500 species. Minimum number of reads to be detected would be 18.4.

sgDF1 %>%
  mutate(ntile=ntile(bbmapFiltered_pairs, n=20)) %>%
  filter(ntile==5) %>%
  filter(bbmapFiltered_pairs==min(bbmapFiltered_pairs)) %>%
  .$bbmapFiltered_pairs
## [1] 104260
sgDF1 %>%
  mutate(over30K=bbmapFiltered_pairs>100000) %>%
  count(over30K) %>% 
  mutate(freq=n/sum(n))
## # A tibble: 2 × 3
##   over30K     n  freq
##   <lgl>   <int> <dbl>
## 1 FALSE     165 0.186
## 2 TRUE      723 0.814

80% of samples have greater than 100,000 read pairs. MetaPhlAn2 uses about 10% of the reads in the library, and 18.4/10,000 yields a functional limit of detection of roughly 0.1%.

18.4/10000 
## [1] 0.00184

A threshold of 0.1% has been used frequently in presence-absence analyses in microbiome data and will be used as a threshold here.

Gardnerella and microbial load

Microbial load by number of clades

loadVClades <- gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  gather("var", "abundance", clades) %>%
  group_by(SampleID, Cohort) %>%
  summarize(nClades=sum(abundance)) %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  ggplot(aes(x=nClades, y=bacLoad)) +
  geom_jitter(alpha=0.25) +
  geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
  stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 1.5, label.x = 0, label.sep = "\n") + 
  facet_wrap(~Cohort) +
  scale_x_continuous(breaks = seq(0,6,1)) +
  labs(x="Clades per Sample", y="Microbial Load")
loadVClades

Is this a function of sequencing depth?

gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  gather("var", "abundance", clades) %>%
  group_by(SampleID, Cohort) %>%
  summarize(nClades=sum(abundance)) %>%
  left_join(sgDF[,c("SampleID", "bacLoad", "Sickle_pairs")], by="SampleID") %>%
  filter(bacLoad<2) %>%
  ggplot(aes(x=nClades, y=Sickle_pairs)) +
  geom_jitter(alpha=0.5) +
  geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
  stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 6e07, label.x = 0, label.sep = "\n") + 
  facet_wrap(~Cohort) +
  scale_x_continuous(breaks = seq(0,6,1)) +
  labs(x="Clades per Sample", y="Library Size") # library size is QC-filtered reads
## `summarise()` has grouped output by 'SampleID'. You can override using the `.groups` argument.
## `geom_smooth()` using formula 'y ~ x'

Microbial Reads

gardCladeAbU %>%
  spread(var, relativeAbundance) %>%
  select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  gather("var", "abundance", clades) %>%
  group_by(SampleID, Cohort) %>%
  summarize(nClades=sum(abundance)) %>%
  left_join(sgDF[,c("SampleID", "bacLoad", "bbmapFiltered_pairs")], by="SampleID") %>%
  filter(bacLoad<2) %>%
  ggplot(aes(x=nClades, y=bbmapFiltered_pairs)) +
  geom_jitter(alpha=0.5) +
  geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
  stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 2e07, label.x = 0, label.sep = "\n") + 
  facet_wrap(~Cohort) +
  scale_x_continuous(breaks = seq(0,6,1)) +
  labs(x="Clades per Sample", y="Microbial Reads")

Assess microbial load by clades per sample after rarefying

allUnchGardSamples <- gardCladeAbU %>%
  filter(var=="Uncharacterized_Gardnerella",
         relativeAbundance>0) %>%
  .$SampleID
#write_lines(allUnchGardSamples, "/Users/hlberman/Desktop/unchGardSamples.txt")

sgDF %>%
  mutate(isUnch=case_when(SampleID %in% allUnchGardSamples~TRUE,
                          SampleID %!in% allUnchGardSamples~FALSE)) %>%
  ggplot(aes(x=isUnch, y=bbmapFiltered_pairs, color=Cohort, shape=Cohort)) +
  geom_boxplot(alpha=0, position = position_dodge(width=1)) +
  geom_quasirandom(alpha=0.5, dodge.width = 1) +
  geom_hline(yintercept = 1e05, linetype=2, color="black") +
  geom_hline(yintercept = 16334, linetype=2, color="gray") +
  scale_y_log10() +
  labs(x="Uncharacterized Gardnerella", y="Microbial Reads")

minReads <- min(sgDF$bbmapFiltered_pairs) 
minReads
## [1] 16334

Rarefy to 100,000 reads per sample

Assess read counts and get samples with 100,000 reads

rarefiedReadCountsDF <- "./rarefied_gard_counts/rarefiedReadCounts.csv" %>%
  read_csv 

k100Samples0 <- rarefiedReadCountsDF %>%
  filter(rarefiedReads==100000) %>%
  .$SampleID

# add names for samples in MOMS-PI Enriched
k100Samples <- k100Samples0[k100Samples0 %in% HMP2HighGardSamples] %>%
  paste0(., "_E") %>%
  c(k100Samples0, .)

Import MetaPhlAn tables

rarefiedMetaphlanOuts <- "./rarefied_gard_counts/rarefiedMergedMetaphlanAbundance.tsv" %>%
  read_tsv %>%
  filter(str_detect(ID, "s__"), !str_detect(ID, "t__")) %>%
  mutate(Species = str_extract(ID, "(?<=s__)\\w+$")) %>%
  select(Species, everything(), -ID) %>%
  #as.data.frame() %>%
  dplyr::rename_at(2:ncol(.), ~str_extract(.x, "[0-9]+")) %>%
  column_to_rownames(var="Species") %>%
  t %>%
  as.data.frame() %>%
  mutate_all(funs(./100)) %>%
  rownames_to_column(var="SampleID")
  
rarefiedGardProportions <- rarefiedMetaphlanOuts %>%
   select(SampleID, Gardnerella_vaginalis) %>%
   replace_na(list(Gardnerella_vaginalis=0))

Proportion of clades

rarefiedCladeRelativeAbundance <- "./rarefied_gard_counts/gardCladeRelativeAbundance.tsv" %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID)) %>%
  select(-propClade) %>%
  filter(n>=2) %>% #count as present if >=2 reads are mapped to that clade
  group_by(SampleID) %>%
  mutate(propClade=n/sum(n)) %>%
  ungroup() %>%
  select(-n)
rarefiedGardDF0 <- rarefiedCladeRelativeAbundance %>%
  full_join(rarefiedGardProportions) %>%
  mutate(relativeAbundance=propClade*Gardnerella_vaginalis) %>%
  select(-propClade) %>%
  spread(Clade, relativeAbundance) %>%
  select(-`<NA>`) %>%
  replace_na(list(C1=0, C2=0, C3=0, C4=0, C5=0, C6=0)) %>%
  right_join(sgDF[,c("SampleID", "SubjectID", "Cohort", "TermPre")]) %>%
  mutate(Uncharacterized_Gardnerella=case_when(Gardnerella_vaginalis>0&(C1+C2+C3+C4+C5+C6==0)~Gardnerella_vaginalis,
                                               Gardnerella_vaginalis==0~0,
                                               Gardnerella_vaginalis>0&(C1+C2+C3+C4+C5+C6>0)~0))

# create rows for MOMS-PI Enriched Samples
rarefiedGardDF <- rarefiedGardDF0 %>%
  filter(SubjectID %in% HMP2highGardSubjects) %>% 
  mutate(Cohort="MOMS-PI Enriched",
         SampleID=paste0(SampleID, "_E"),
         SubjectID=paste0(SubjectID, "_E")) %>%
  full_join(rarefiedGardDF0)

Figure S6: Microbial load vs. rarefied clade counts

loadVrarefiedClades <- rarefiedGardDF %>%
  filter(SampleID %in% k100Samples) %>% 
  select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
  mutate_at(clades, ~case_when(.x>0~1,
                               .x==0~0)) %>%
  mutate(nClades=(C1+C2+C3+C4+C5+C6), 
         Cohort=factor(Cohort, cohorts)) %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  ggplot(aes(x=nClades, y=bacLoad)) +
  geom_jitter(alpha=0.25) +
  geom_smooth(method="lm", se=FALSE, color="blue", linetype=1) +
  stat_cor(method = "spearman", alternative = "greater", cor.coef.name = c("rho"), label.y = 1.5, label.x = 0, label.sep = "\n") + 
  facet_wrap(~Cohort) +
  scale_x_continuous(breaks = seq(0,6,1)) +
  labs(x="Clades per Sample", y="Microbial Load")
loadVrarefiedClades

#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS6_rarefiedCladesPerSampleVsBacLoad.png", sep = "_")))

Microbial load by taxa presence-absence

bacloadVsTaxWilcox <- mDFtoiClades %>%  
  gather("Taxon", "relativeAbundance", c(clades, Gardnerella_vaginalis, lactos, anaerobes)) %>%
  select(SampleID, Cohort, Taxon, relativeAbundance) %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(relativeAbundance=case_when(relativeAbundance<0.001~"-",
                                      relativeAbundance>=0.001~"+"),
         relativeAbundance=factor(relativeAbundance, levels=c("+", "-")),
         alternative=case_when(Taxon %in% c("Gardnerella_vaginalis", clades, anaerobes)~"greater",
                               Taxon %in% lactos~"less"),
         Taxon=factor(Taxon, levels = c("Gardnerella_vaginalis", clades, lactos, anaerobes), labels = c(abbrevs)),
         Cohort=factor(Cohort, levels = cohorts, labels=c("Stan. Enr.", "UAB Enr.", "MP Enr.", "MOMS-PI")), 
         x=paste0(Cohort, Taxon)) %>%
  filter(x!="UAB Enr.Lg") %>%
  select(-x) %>%
  group_by(alternative) %>%
  nest %>%
  mutate(data=map(data, ~group_by(.x, Cohort, Taxon))) %>%
  mutate(wilcox_test=map2(data, alternative, ~wilcox_test(data=.x, bacLoad~relativeAbundance, alternative=.y))) %>%
  select(wilcox_test) %>%
  unnest %>%
  adjust_pvalue(method = "BH") %>%
  mutate(p.star=case_when(p.adj>0.05~"",
                          p.adj<0.05&p.adj>0.01~"*",
                          p.adj<0.01&p.adj>0.001~"**",
                          p.adj<0.001~"***"))

Plot

#fig.width=7, fig.height=2
mDFtoiClades %>%  
  gather("Taxon", "relativeAbundance", c(clades, Gardnerella_vaginalis, lactos, anaerobes)) %>%
  select(SampleID, Cohort, Taxon, relativeAbundance) %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(relativeAbundance=case_when(relativeAbundance<0.001~"-",
                                      relativeAbundance>=0.001~"+"),
         Taxon=factor(Taxon, levels = c("Gardnerella_vaginalis", clades, lactos, anaerobes), labels = c(abbrevs)),
         Cohort=factor(Cohort, levels = cohorts, labels=c("Stan. Enr.", "UAB Enr.", "MP Enr.", "MOMS-PI"))) %>%
  ggplot(aes(x=relativeAbundance, y=bacLoad)) +
  geom_violin(aes(color=relativeAbundance, fill=relativeAbundance), alpha=0.5) +
  geom_text(data =bacloadVsTaxWilcox, aes(y=1, x=1.5, label=p.star), color="black", size=4, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(Cohort~Taxon) +
  scale_y_log10() +
  theme(text = element_text(size=15),
        axis.text = element_text(size = 12),
        legend.position = "none") +
  labs(x="", y="Microbial Load")

#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS7_presAbsVsBacLoad.png", sep = "_")))

Figure 4: Gardnerella and increases in microbial load

Microbial load vs. clades per sample and short list microbial load vs. presence-absence

#fig.width=5, fig.height=1.5
bacLoadpresAbs <- mDFtoiClades %>%  
  gather("Taxon", "relativeAbundance", c(clades, Gardnerella_vaginalis, lactos, anaerobesSL)) %>%
  select(SampleID, Cohort, Taxon, relativeAbundance) %>%
  left_join(bacLoad, by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(relativeAbundance=case_when(relativeAbundance<0.001~"-",
                                      relativeAbundance>=0.001~"+"),
         Taxon=factor(Taxon, levels = c("Gardnerella_vaginalis", clades, lactos, anaerobesSL), labels = c(abbrevsSL)),
         Cohort=factor(Cohort, levels = cohorts, labels=c("Stan. Enr.", "UAB Enr.", "MP Enr.", "MOMS-PI"))) %>%
  ggplot(aes(x=relativeAbundance, y=bacLoad)) +
  geom_violin(aes(color=relativeAbundance, fill=relativeAbundance), alpha=0.5) +
  geom_text(data=subset(bacloadVsTaxWilcox, Taxon %in% abbrevsSL), aes(y=1, x=1.5, label=p.star), color="black", size=4, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(Cohort~Taxon) +
  scale_y_log10() +
  theme(text = element_text(size=15),
        axis.text = element_text(size = 12),
        strip.text.y = element_text(size=8),
        legend.position = "none") +
  labs(x="", y="Microbial Load")

plot_grid(loadVClades, bacLoadpresAbs, nrow = 1, rel_widths = c(1,2.5), labels = c("A", "B"), label_size = 15)

#ggsave(file.path(figureOut, paste(Sys.Date(), "Figure4_gardAndBacLoad.png", sep="_")))

Gardnerella and the vaginal microbiome

Relative abundance of common taxa

Get the dominant taxon in each sample for organizing – using clades. Define as the taxon with the greatest relative abundance. If no taxon has a relative abundance greater than 30%, specify as “no type”

dominantTax <- gardCladeAb %>%
  spread(Clade, relativeAbundance) %>%
  right_join(mDF, by=c("SampleID", "Cohort")) %>%
  select(-Gardnerella_vaginalis) %>%
  mutate_at(vars(clades), replace_na,  0) %>%
  gather("dominantTax", "abundance", !one_of(c("SampleID", "Cohort"))) %>%
  group_by(SampleID) %>%
  filter(abundance==max(abundance)) %>%
  mutate(dominantTax=case_when(abundance<.3~"No type",
                               abundance>=.3~dominantTax)) %>%
  select(-abundance) %>%
  ungroup

unique(dominantTax$dominantTax)
##  [1] "C1"                         "No type"                   
##  [3] "C2"                         "C3"                        
##  [5] "C4"                         "C5"                        
##  [7] "C6"                         "Corynebacterium_sp_HFH0082"
##  [9] "Bifidobacterium_longum"     "Atopobium_vaginae"         
## [11] "Bacteroides_fragilis"       "Prevotella_amnii"          
## [13] "Prevotella_bivia"           "Prevotella_timonensis"     
## [15] "Staphylococcus_epidermidis" "Lactobacillus_crispatus"   
## [17] "Lactobacillus_delbrueckii"  "Lactobacillus_gasseri"     
## [19] "Lactobacillus_iners"        "Lactobacillus_jensenii"    
## [21] "Escherichia_coli"           "Haemophilus_parainfluenzae"
## [23] "Mycoplasma_hominis"         "Lactobacillus_phage_Lv_1"  
## [25] "Alphapapillomavirus_9"      "Staphylococcus_aureus"     
## [27] "Lactobacillus_helveticus"   "Bradyrhizobium_sp_DFCI_1"  
## [29] "Enterobacter_cloacae"       "Alphapapillomavirus_11"    
## [31] "Alphapapillomavirus_14"     "Alphapapillomavirus_3"     
## [33] "Alphapapillomavirus_5"
length(unique(dominantTax$dominantTax))
## [1] 33

Create dataframe for order of dominant types to order in barplot figure

dominantTaxOrder <- data.frame(dominantTax=c(clades, lactos, "Atopobium_vaginae", "Prevotella_bivia", "Prevotella_amnii", "Prevotella_timonensis", "Mycoplasma_hominis", "Corynebacterium_sp_HFH0082", "Bifidobacterium_longum", "Bacteroides_fragilis", "Staphylococcus_epidermidis", "Lactobacillus_delbrueckii", "Escherichia_coli", "Haemophilus_parainfluenzae", "Lactobacillus_phage_Lv_1", "Alphapapillomavirus_9", "Staphylococcus_aureus", "Lactobacillus_helveticus", "Bradyrhizobium_sp_DFCI_1", "Enterobacter_cloacae", "Alphapapillomavirus_11", "Alphapapillomavirus_14", "Alphapapillomavirus_3", "Alphapapillomavirus_5", "No type"),
                               dominantTaxOrder=c(1:33))

Stacked bar plot

# Ordered by dominant taxon 
taxaBarPlot <- mDFtoiClades %>%
  select(SampleID, Cohort, clades, lactos, anaerobesSL) %>%
  filter(SampleID!="6744034", 
         SampleID!="6744034_E") %>% # this sample is 100% uncharacterized gard and therefore cannot be displayed in this figure
  mutate(Other=1-C1-C2-C3-C4-C5-C6-Lactobacillus_crispatus-Lactobacillus_gasseri-Lactobacillus_jensenii-Lactobacillus_iners-Atopobium_vaginae-Mycoplasma_hominis-Prevotella_bivia-Ureaplasma_parvum) %>%
  gather("Taxon", "relativeAbundance", !contains(c("SampleID","Cohort"))) %>%
  mutate(relativeAbundance=na_if(relativeAbundance, 0),
         Taxon=factor(Taxon, levels=c("Other", clades, lactos, anaerobesSL))) %>%
  left_join(dominantTax, by=c("SampleID", "Cohort")) %>%
  left_join(dominantTaxOrder, by="dominantTax") %>%
  group_by(SampleID) %>%
  ggplot(aes(x=reorder(reorder(SampleID, relativeAbundance, na.rm=TRUE), dominantTaxOrder, FUN=min), y=relativeAbundance, color=Taxon, fill=Taxon)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  facet_wrap(~Cohort, scales = "free") +
  labs(x="Sample", y="Relative Abundance") +
  taxColorsOther
taxaBarPlot   

# ordered by microbial load
mDFtoiClades %>%
  select(SampleID, Cohort, clades, lactos, anaerobesSL) %>%
  mutate(Other=1-(C1+C2+C3+C4+C5+C6+Lactobacillus_crispatus+Lactobacillus_gasseri+Lactobacillus_jensenii+Lactobacillus_iners+Atopobium_vaginae+Mycoplasma_hominis+Prevotella_bivia+Ureaplasma_parvum)) %>%
  gather("Taxon", "relativeAbundance", !contains(c("SampleID","Cohort"))) %>%
  left_join(bacLoad, by="SampleID") %>%
  mutate(Taxon=factor(Taxon, levels=c("Other", clades, lactos, anaerobesSL)),
         absoluteAbundance=relativeAbundance*bacLoad) %>%
  ggplot(aes(x=reorder(SampleID, absoluteAbundance, FUN=sum), y=relativeAbundance, color=Taxon, fill=Taxon)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  facet_wrap(~Cohort, scales="free_x") +
  labs(x="Sample", y="Relative Abundance") +
  taxColorsOther

Absolute abundance of common taxa

# By absolute abundance
aaBarPlot <- mDFtoiClades %>%
  select(SampleID, Cohort, clades, lactos, anaerobesSL) %>%
  mutate(Other=1-C1-C2-C3-C4-C5-C6-Lactobacillus_crispatus-Lactobacillus_gasseri-Lactobacillus_jensenii-Lactobacillus_iners-Atopobium_vaginae-Mycoplasma_hominis-Prevotella_bivia-Ureaplasma_parvum) %>%
  gather("Taxon", "relativeAbundance", !contains(c("SampleID","Cohort"))) %>%
  left_join(bacLoad, by="SampleID") %>%
  mutate(Taxon=factor(Taxon, levels=c("Other", clades, lactos, anaerobesSL)),
         absoluteAbundance=relativeAbundance*bacLoad) %>%
  left_join(bacLoad, by="SampleID") %>%
  ggplot(aes(x=reorder(SampleID, absoluteAbundance, FUN=sum), y=absoluteAbundance, color=Taxon, fill=Taxon)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  facet_wrap(~Cohort, scales="free_x") +
  labs(x="Sample", y="Absolute Abundance") +
  taxColorsOther
aaBarPlot

aaBarPlot +
    coord_cartesian(ylim=c(0,1.5))

What is in the outlier samples?

outlierSamples <- bacLoad %>%
  filter(bacLoad > 2) %>%
  .$SampleID
outlierSamples
## [1] "4009835178" "1005601348" "6744677"    "6743991"    "6744354"   
## [6] "6744871"    "6744459"    "6744677_E"  "6744871_E"

Get top 5 taxa from each outlier sample

mDF %>%
  filter(SampleID %in% outlierSamples) %>%
  gather("Taxon", "Abundance", !contains(c("SampleID", "Cohort"))) %>%
  group_by(SampleID) %>%
  top_n(5) %>%
  arrange(-Abundance) %>%
  split(.$SampleID)
## $`1005601348`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID   Cohort            Taxon                          Abundance
##   <chr>      <fct>             <chr>                              <dbl>
## 1 1005601348 Stanford Enriched Prevotella_timonensis             0.300 
## 2 1005601348 Stanford Enriched Gardnerella_vaginalis             0.196 
## 3 1005601348 Stanford Enriched Lactobacillus_gasseri             0.106 
## 4 1005601348 Stanford Enriched Varibaculum_cambriense            0.0689
## 5 1005601348 Stanford Enriched Clostridiales_bacterium_BV3C26    0.0526
## 
## $`4009835178`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID   Cohort       Taxon                 Abundance
##   <chr>      <fct>        <chr>                     <dbl>
## 1 4009835178 UAB Enriched Gardnerella_vaginalis    0.243 
## 2 4009835178 UAB Enriched Atopobium_vaginae        0.142 
## 3 4009835178 UAB Enriched Prevotella_timonensis    0.0688
## 4 4009835178 UAB Enriched Finegoldia_magna         0.0601
## 5 4009835178 UAB Enriched Prevotella_disiens       0.0405
## 
## $`6743991`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID Cohort  Taxon                         Abundance
##   <chr>    <fct>   <chr>                             <dbl>
## 1 6743991  MOMS-PI Prevotella_timonensis            0.185 
## 2 6743991  MOMS-PI Porphyromonas_bennonis           0.168 
## 3 6743991  MOMS-PI Porphyromonas_asaccharolytica    0.118 
## 4 6743991  MOMS-PI Peptoniphilus_sp_BV3AC2          0.0732
## 5 6743991  MOMS-PI Porphyromonas_uenonis            0.0645
## 
## $`6744354`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID Cohort  Taxon                        Abundance
##   <chr>    <fct>   <chr>                            <dbl>
## 1 6744354  MOMS-PI Prevotella_amnii                0.405 
## 2 6744354  MOMS-PI Gardnerella_vaginalis           0.187 
## 3 6744354  MOMS-PI Prevotella_timonensis           0.163 
## 4 6744354  MOMS-PI Porphyromonas_uenonis           0.0635
## 5 6744354  MOMS-PI Clostridiales_genomosp_BVAB3    0.0529
## 
## $`6744459`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID Cohort  Taxon                   Abundance
##   <chr>    <fct>   <chr>                       <dbl>
## 1 6744459  MOMS-PI Prevotella_timonensis      0.147 
## 2 6744459  MOMS-PI Oligella_urethralis        0.135 
## 3 6744459  MOMS-PI Bifidobacterium_breve      0.0911
## 4 6744459  MOMS-PI Peptoniphilus_sp_BV3AC2    0.0813
## 5 6744459  MOMS-PI Gardnerella_vaginalis      0.0658
## 
## $`6744677`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID Cohort  Taxon                       Abundance
##   <chr>    <fct>   <chr>                           <dbl>
## 1 6744677  MOMS-PI Gardnerella_vaginalis          0.733 
## 2 6744677  MOMS-PI Prevotella_amnii               0.172 
## 3 6744677  MOMS-PI Megasphaera_genomosp_type_1    0.0398
## 4 6744677  MOMS-PI Atopobium_vaginae              0.0184
## 5 6744677  MOMS-PI Lactobacillus_iners            0.0155
## 
## $`6744677_E`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID  Cohort           Taxon                       Abundance
##   <chr>     <fct>            <chr>                           <dbl>
## 1 6744677_E MOMS-PI Enriched Gardnerella_vaginalis          0.733 
## 2 6744677_E MOMS-PI Enriched Prevotella_amnii               0.172 
## 3 6744677_E MOMS-PI Enriched Megasphaera_genomosp_type_1    0.0398
## 4 6744677_E MOMS-PI Enriched Atopobium_vaginae              0.0184
## 5 6744677_E MOMS-PI Enriched Lactobacillus_iners            0.0155
## 
## $`6744871`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID Cohort  Taxon                       Abundance
##   <chr>    <fct>   <chr>                           <dbl>
## 1 6744871  MOMS-PI Gardnerella_vaginalis          0.794 
## 2 6744871  MOMS-PI Megasphaera_genomosp_type_1    0.0679
## 3 6744871  MOMS-PI Dialister_micraerophilus       0.0357
## 4 6744871  MOMS-PI Prevotella_amnii               0.0324
## 5 6744871  MOMS-PI Atopobium_vaginae              0.0307
## 
## $`6744871_E`
## # A tibble: 5 × 4
## # Groups:   SampleID [1]
##   SampleID  Cohort           Taxon                       Abundance
##   <chr>     <fct>            <chr>                           <dbl>
## 1 6744871_E MOMS-PI Enriched Gardnerella_vaginalis          0.794 
## 2 6744871_E MOMS-PI Enriched Megasphaera_genomosp_type_1    0.0679
## 3 6744871_E MOMS-PI Enriched Dialister_micraerophilus       0.0357
## 4 6744871_E MOMS-PI Enriched Prevotella_amnii               0.0324
## 5 6744871_E MOMS-PI Enriched Atopobium_vaginae              0.0307

Many have Gardnerella, all have Prevotella timonensis or P. amnii

Co occurrence with Jaccard distance

All samples

Long List

#fig.height=5, fig.width=6.5
jaccards <- mDFtoiClades %>%
  select(SampleID, Cohort, Gardnerella_vaginalis, clades, lactos, anaerobes) %>%
  mutate_if(is.numeric, ~case_when(.x>=0.001~1, 
                                   .x<0.001~0)) %>%
  split(.$Cohort) %>%
  map(~column_to_rownames(.x, var="SampleID")) %>%
  map(~select(.x, -Cohort)) %>%
  map(t) %>%
  map(~vegdist(.x, method="jaccard")) %>%
  map(as.matrix)

jaccards$`Stanford Enriched`[lower.tri(jaccards$`Stanford Enriched`, diag = TRUE)] <- NA
jaccards$`UAB Enriched`[lower.tri(jaccards$`UAB Enriched`, diag = TRUE)] <- NA
jaccards$`MOMS-PI Enriched`[lower.tri(jaccards$`MOMS-PI Enriched`, diag = TRUE)] <- NA
jaccards$`MOMS-PI`[lower.tri(jaccards$`MOMS-PI`, diag = TRUE)] <- NA

jaccards %>%
  map(melt) %>%
  map2(names(.), ~mutate(.x, cohort=.y)) %>%
  purrr::reduce(full_join) %>%
  mutate(Var1=factor(Var1, levels=c("Gardnerella_vaginalis", clades, lactos, anaerobes), labels = abbrevs), 
         Var2=factor(Var2, levels=rev(c("Gardnerella_vaginalis", clades, lactos, anaerobes)), labels = rev(abbrevs)),
         cohort=factor(cohort, levels = cohorts)) %>%
  filter(!(Var1=="TG" & Var2 %in% clades),
         !is.na(value)) %>%
  ggplot(aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  scale_x_discrete(drop=FALSE) +
  scale_y_discrete(drop=FALSE) +
  scale_fill_gradient2(low = "#56B4E9", mid = "gray", high = "#D55E00", midpoint = 0.5) +
  theme(axis.text.x = element_text(angle=45, hjust=1), 
        strip.text.x = element_text(size = 13)) +
  facet_wrap(~cohort) +
  labs(x="", y="", fill="Jaccard Distance")

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS8_jacFourCohort.png", sep="_")), height=6, width=7.5, units="in")

Figure 5: Gardnerella impacts on signatures of the vaginal microbiome

Taxon Abundances and short list of co-occurrences

#fig.width=5.25, fig.height=2
jaccardsSL <- mDFtoiClades %>%
  select(SampleID, Cohort, Gardnerella_vaginalis, clades, lactos, anaerobesSL) %>%
  mutate_if(is.numeric, ~case_when(.x>=0.001~1, 
                                   .x<0.001~0)) %>%
  split(.$Cohort) %>%
  map(~column_to_rownames(.x, var="SampleID")) %>%
  map(~select(.x, -Cohort)) %>%
  map(t) %>%
  map(~vegdist(.x, method="jaccard")) %>%
  map(as.matrix)

jaccardsSL$`Stanford Enriched`[lower.tri(jaccardsSL$`Stanford Enriched`, diag = TRUE)] <- NA
jaccardsSL$`UAB Enriched`[lower.tri(jaccardsSL$`UAB Enriched`, diag = TRUE)] <- NA
jaccardsSL$`MOMS-PI Enriched`[lower.tri(jaccardsSL$`MOMS-PI Enriched`, diag = TRUE)] <- NA
jaccardsSL$`MOMS-PI`[lower.tri(jaccardsSL$`MOMS-PI`, diag = TRUE)] <- NA

jaccardFigure <- jaccardsSL %>%
  map(melt) %>%
  map2(names(.), ~mutate(.x, cohort=.y)) %>%
  purrr::reduce(full_join) %>%
  mutate(Var1=factor(Var1, levels=c("Gardnerella_vaginalis", clades, lactos, anaerobesSL), labels = abbrevsSL), 
         Var2=factor(Var2, levels=rev(c("Gardnerella_vaginalis", clades, lactos, anaerobesSL)), labels = rev(abbrevsSL)),
         cohort=factor(cohort, levels=cohorts)) %>%
  filter(!(Var1=="TG" & Var2 %in% clades),
         !is.na(value)) %>%
  ggplot(aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  scale_x_discrete(drop=FALSE) +
  scale_y_discrete(drop=FALSE) +
  scale_fill_gradient2(low = "#56B4E9", mid = "gray", high = "#D55E00", midpoint = 0.5) +
  theme(axis.text.x = element_text(angle=45, hjust=1, size=7), 
        axis.text.y = element_text(size=7)) +
  facet_wrap(~cohort) +
  coord_fixed() +
  labs(x="", y="", fill="Jaccard Distance")

plot_grid(taxaBarPlot, jaccardFigure, nrow=1, labels = c("A", "B"), label_size = 15)

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "Figure5_microbiomeAndJaccard.png", sep="_")))

Gardnerella variants by PTB status

Clade abundance by PTB status

Relative Abundance

#plot showing clade relative abundance in term vs. preterm birth in all cohorts
allCohortsCladeRelativePTBplot <- gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
  filter(!is.na(TermPre), var!="Gardnerella_vaginalis", var!="Uncharacterized_Gardnerella") %>% #SampleID %!in% unchGardSamples,
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(relativeAbundance)) %>%
  mutate(var=factor(var, levels = clades, labels = clades),
         Cohort=factor(Cohort, levels=cohorts, labels=cohortAbbrevs)) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  binaryColors +
  facet_grid(Cohort~var, scales="free_y") +
  theme(legend.position = "none") +
  labs(x=NULL, y="Relative Abundance")

allCohortsCladeRelativePTBplot

  1. Compute differences between means for each subject and transform
# create function to mutate dataframe to create differences and then observe transformations
meanDifferenceTransformations <- function(inputDF, vars){

  #get list of pairs
  pairs <- combn(vars, 2) %>%
    t %>%
    as.data.frame %>%
    mutate(pairs=paste(V1, V2, sep="-")) %>%
    .$pairs
  
  # calculate differences and transformations of values
  diffs_trans <- inputDF %>%
    mutate(meanAbs=set_names(meanAbs, var)) %>%
    nest(diffs=c(var, meanAbs)) %>%
    mutate(diffs=map(diffs, ~abs(outer(.x$meanAbs, .x$meanAbs, `-`)))) %>% #compute differences
    mutate(diffs=map(diffs, data.frame)) %>%
    mutate(diffs=map(diffs, ~rownames_to_column(.x, var="var1"))) %>%
    mutate(diffs=map(diffs, ~gather(.x, "var2", "diff", vars))) %>%
    mutate(diffs=map(diffs, ~transmute(.x, pair=paste(var1, var2, sep="-"), diff=diff))) %>%
    mutate(diffs=map(diffs, ~filter(.x, pair %in% pairs))) %>% # filter repeats
    mutate(diffs=map(diffs, ~mutate(.x, sqrt=sqrt(diff), #square root transformation
                                          frthrt=diff^(1/4), #fourth root transformation
                                          log=log10(diff + 1e-5)))) %>% #log10 transformation with pseudocount
    unnest
  
  # plot histograms
  plots <- diffs_trans %>%
    gather("Transformation", "Value", c(diff, sqrt, frthrt, log)) %>%
    mutate(Transformation=factor(Transformation, levels=c("diff", "sqrt", "frthrt", "log"), labels=c("Difference", "Square Root", "Fourth Root", "log10 with Pseudocount"))) %>%
    ggplot(aes(x=Value)) +
    geom_histogram() +
    facet_wrap(~Transformation, scales="free")
  
  if(vars==clades){
  # perform shapiro-wilk test for normality. null hypothesis is normality.
  shapiro_results <- map(list(Difference=diffs_trans$diff, sqrt=diffs_trans$sqrt, fourthrt=diffs_trans$frthrt, log=diffs_trans$log), shapiro_test) %>% 
      map2(., names(.), ~mutate(.x, transformation=.y)) %>%
      purrr::reduce(full_join) %>%
      select(transformation, statistic, p.value)
  return(list(diffs_trans=diffs_trans, plots=plots, shapiro_results=shapiro_results))
  }
  
  if(vars==genomos){
    return(list(diffs_trans=diffs_trans, plots=plots))
  }

 }

Assess and choose transformation

cladeRelativeTransfs <- gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
  filter(!is.na(TermPre), var!="Uncharacterized_Gardnerella", var!="Gardnerella_vaginalis") %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, meanAbs=mean(relativeAbundance)) %>%
  meanDifferenceTransformations(., clades)

cladeRelativeTransfs$plots

cladeRelativeTransfs$shapiro_results %>%
  formattable()
transformation statistic p.value
Difference 0.6367996 3.208264e-63
sqrt 0.8608028 1.845471e-46
fourthrt 0.9725260 3.774719e-24
log 0.8951225 5.074615e-42

Choose fourth root transformation

  1. Perform logistic regression on mean differences to see if they vary by term or preterm birth Null hypothesis is that coefficients = 0 Model: \(y_{ij}=\beta_{0i} + x_{ij}\beta_{1ij}+e{ij}\)
# function for performing logistic regression on mean differences of clade abundances
meanDiffLogisticRegression <- function(inputDF, transformation){
  # get cohort Ns
  coh_ns <- gardCladeAbU %>%
    left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
    filter(!is.na(TermPre), var!="Uncharacterized_Gardnerella", var!="Gardnerella_vaginalis") %>%
    group_by(Cohort) %>%
    summarize(cohort_n=n_distinct(SubjectID))
  
  # perform logitic regressions tests
  logiOuts <- inputDF %>%
    gather("transformation", "difference", c(diff, sqrt, frthrt, log)) %>% # filter to choose values with appropriate transformation
    filter(transformation==transformation) %>%
    select(Cohort, TermPre, pair, difference) %>%
    mutate(TermPre=case_when(TermPre=="T"~0,
                             TermPre=="P"~1)) %>%
    split(list(.$Cohort, .$pair)) %>%
    map(~glm(data=.x, TermPre~difference, family="binomial")) %>%
    map(tidy) %>% 
    map2(., names(.), ~mutate(.x, Cohort_Pair=.y)) %>%
    map(~separate(.x, Cohort_Pair, c("Cohort", "Pair"), sep="\\.")) %>%
    purrr::reduce(full_join, by = c("term", "estimate", "std.error", "statistic", "p.value", "Cohort", "Pair")) %>%
    left_join(coh_ns, by="Cohort") %>%
    mutate(std.dev=std.error*sqrt(cohort_n))
  
  logiPlots <- logiOuts %>%
    filter(term=="difference") %>%
    mutate(Cohort=factor(Cohort, levels=cohorts, labels = cohortAbbrevs),
           Pair=factor(Pair, levels = rev(unique(logiOuts$Pair)))) %>%
    ggplot(aes(y=Pair, x=estimate, xmin=estimate-std.dev, xmax=estimate+std.dev)) +
    geom_pointrange() +
    geom_vline(aes(xintercept=0), linetype=2, color="gray", inherit.aes=FALSE) +
    facet_grid(~Cohort) 
  
  return(list(logiOuts=logiOuts, logiPlots=logiPlots))
}
cladeRelMeanDiffRegression <- meanDiffLogisticRegression(cladeRelativeTransfs$diffs_trans, "frthrt")

cladeRelMeanDiffRegression$logiOuts %>%
  filter(term=="difference",
         p.value<0.05) %>%
        nrow
## [1] 0

Figure S10: All cohorts Gardnerella and PTB logistic regression

#fig.height=2, fig.width=5
plot_grid(allCohortsCladeRelativePTBplot, cladeRelMeanDiffRegression$logiPlots, nrow=1, rel_widths = c(1, 0.75), align = "v", labels = c("A", "B"), label_size = 15)

#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS10_allCladesLogisticRegressionPTB.png", sep="_")))

Absolute Abundance

Plot abundances

#plot showing clade absolute abundance in term vs. preterm birth in all cohorts
allCohortsCladeAbsolutePTBplot <- gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
  filter(!is.na(TermPre), var!="Gardnerella_vaginalis", var!="Uncharacterized_Gardnerella", bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(absoluteAbundance)) %>%
  mutate(var=factor(var, levels = clades, labels = clades),
         Cohort=factor(Cohort, levels=cohorts, labels=cohortAbbrevs)) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  binaryColors +
  facet_grid(Cohort~var, scales="free_y") +
  theme(legend.position = "none") +
  labs(x=NULL, y="Absolute Abundance")

allCohortsCladeAbsolutePTBplot

Assess and choose transformation

cladeAbsoluteTransfs <- gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
  filter(!is.na(TermPre), var!="Gardnerella_vaginalis", var!="Uncharacterized_Gardnerella", bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, meanAbs=mean(absoluteAbundance)) %>%
  meanDifferenceTransformations(., clades)

cladeAbsoluteTransfs$plots

cladeAbsoluteTransfs$shapiro_results %>%
  formattable()
transformation statistic p.value
Difference 0.4769455 2.720409e-70
sqrt 0.7729894 1.124654e-54
fourthrt 0.9456048 1.289173e-32
log 0.9506172 2.456043e-31

Choose log10 + pseudocount transformation

Perform logistic regression on mean differences to see if they vary by term or preterm birth Null hypothesis is that coefficients = 0

cladeAbsMeanDiffRegression <- meanDiffLogisticRegression(cladeAbsoluteTransfs$diffs_trans, "log")

cladeAbsMeanDiffRegression$logiOuts %>%
  filter(term=="difference",
        p.value<0.05) %>%
  nrow
## [1] 0
#fig.height=2, fig.width=5
plot_grid(allCohortsCladeAbsolutePTBplot, cladeAbsMeanDiffRegression$logiPlots, nrow=1, rel_widths = c(1, 0.75), align = "v")

Gardnerella and PTB in MOMS-PI study Only

Due to enrichment in Gardnerella in Stanford and UAB clades. The following analyses will focus on the MOMS-PI cohorts. Create data frame:

momsptbDF <- gardCladeAbU %>%
    left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
    filter(str_detect(Cohort, "MOMS-PI"),
        !is.na(TermPre)) %>%
  mutate(Cohort=as.character(Cohort))

Variant abundance versus PTB

Clades

Wilcox Ranked Sum in MOMS-PI cohort ONLY

cladeRelativeWilcoxResults <- momsptbDF %>%
  filter(Cohort=="MOMS-PI",
         var!="Uncharacterized_Gardnerella") %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Relative=mean(relativeAbundance)) %>%
  group_by(var) %>%
  wilcox_test(Relative~TermPre, alternative="greater")
  
cladeRelativeWilcoxResults %>%
  arrange(p) %>%
  formattable()
var .y. group1 group2 n1 n2 statistic p
Gardnerella_vaginalis Relative P T 43 90 2409.0 0.0114
C3 Relative P T 43 90 2279.0 0.0488
C1 Relative P T 43 90 2141.0 0.1610
C6 Relative P T 43 90 2080.0 0.2250
C5 Relative P T 43 90 2057.5 0.2730
C2 Relative P T 43 90 2009.0 0.3610
C4 Relative P T 43 90 1731.5 0.8370

Wilcoxon rank sum test: Absolute abundance

cladeAbsoluteWilcoxResults <- momsptbDF %>%
  left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", bacLoad<2,
         var!="Uncharacterized_Gardnerella") %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Absolute=mean(absoluteAbundance)) %>%
  group_by(var) %>%
  wilcox_test(Absolute~TermPre, alternative="greater")
  
cladeAbsoluteWilcoxResults %>%
  arrange(p) %>%
  formattable()
var .y. group1 group2 n1 n2 statistic p
Gardnerella_vaginalis Absolute P T 43 90 2357.0 0.0213
C3 Absolute P T 43 90 2222.0 0.0835
C1 Absolute P T 43 90 2179.0 0.1200
C2 Absolute P T 43 90 2108.0 0.2010
C5 Absolute P T 43 90 2088.5 0.2250
C6 Absolute P T 43 90 2070.5 0.2390
C4 Absolute P T 43 90 1933.5 0.5040

Plot

# save p values
cladeWilcoxResults <- cladeRelativeWilcoxResults %>%
  full_join(cladeAbsoluteWilcoxResults, by = c("var", ".y.", "group1", "group2", "n1", "n2", "statistic", "p")) %>%
  mutate(p.star=case_when(p>0.05~"",
                          p<0.05&p>0.01~"*",
                          p<0.01&p>0.001~"**",
                          p<0.001~"***"),
         var=factor(var, levels = c(clades,  "Gardnerella_vaginalis"), labels = c(clades, "Total")))

Relative Abundance Plot:

momspiCladeAbundancePTBPlot <-  momsptbDF %>%
  filter(Cohort=="MOMS-PI",
         var!="Uncharacterized_Gardnerella") %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(relativeAbundance)) %>%
  mutate(var=factor(var, levels = c(clades,  "Gardnerella_vaginalis"), labels = c(clades, "Total"))) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(data = cladeWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(.~var) +
  theme(legend.position = "none",
        strip.text.x = element_text(size=13),
        strip.text.y = element_text(size=9),
        axis.title=element_text(size=16)) +
  labs(x=NULL, y="Relative\nAbundance")
momspiCladeAbundancePTBPlot

Absolute Abundance Plot:

momsptbDF %>%
  filter(Cohort=="MOMS-PI",
         var!="Uncharacterized_Gardnerella") %>%
  left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(absoluteAbundance)) %>%
  mutate(var=factor(var, levels = c(clades,  "Gardnerella_vaginalis"), labels = c(clades, "Total"))) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(data = cladeWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(.~var) +
  theme(legend.position = "none",
        strip.text.x = element_text(size=13),
        strip.text.y = element_text(size=9),
        axis.title=element_text(size=16)) +
  labs(x=NULL, y="Absolute Abundance")

Genomospecies

Wilcoxon rank sum test for genomospecies, MOMS-PI cohort ONLY

# Relative abundance
genomoRelativeWilcoxResults <- gardGenomoAbU %>%
    left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
    filter(Cohort=="MOMS-PI",
           !is.na(TermPre),
           var!="Uncharacterized_Gardnerella") %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Relative=mean(relativeAbundance)) %>%
  group_by(var) %>%
  wilcox_test(Relative~TermPre, alternative="greater")
  

# Absolute abundance
genomoAbsoluteWilcoxResults <- gardGenomoAbU %>%
    left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
    filter(Cohort=="MOMS-PI",
           !is.na(TermPre),
           var!="Uncharacterized_Gardnerella") %>%
  mutate(Cohort=as.character(Cohort)) %>%
  left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, Absolute=mean(absoluteAbundance)) %>%
  group_by(var) %>%
  wilcox_test(Absolute~TermPre, alternative="greater")
  
genomoRelativeWilcoxResults %>%
  arrange(p) %>%
  formattable
var .y. group1 group2 n1 n2 statistic p
GS10 Relative P T 43 90 2498.0 0.000208
GS9 Relative P T 43 90 2524.0 0.000298
GS8 Relative P T 43 90 2445.0 0.001940
GS14 Relative P T 43 90 2368.0 0.011100
Gardnerella_vaginalis Relative P T 43 90 2409.0 0.011400
GS2 Relative P T 43 90 2207.0 0.084200
GS12 Relative P T 43 90 2127.0 0.160000
GS13 Relative P T 43 90 2078.0 0.208000
GS11 Relative P T 43 90 2023.0 0.277000
GS7 Relative P T 43 90 2023.5 0.321000
GS1 Relative P T 43 90 2019.0 0.341000
GS4 Relative P T 43 90 1971.0 0.426000
GS6 Relative P T 43 90 1969.0 0.436000
GS5 Relative P T 43 90 1913.0 0.544000
GS3 Relative P T 43 90 1869.0 0.634000
genomoAbsoluteWilcoxResults %>%
  arrange(p) %>%
  formattable
var .y. group1 group2 n1 n2 statistic p
GS10 Absolute P T 43 90 2502.0 0.000189
GS9 Absolute P T 43 90 2514.0 0.000369
GS8 Absolute P T 43 90 2451.0 0.001630
GS14 Absolute P T 43 90 2376.0 0.009930
Gardnerella_vaginalis Absolute P T 43 90 2357.0 0.021300
GS2 Absolute P T 43 90 2181.0 0.106000
GS12 Absolute P T 43 90 2146.0 0.137000
GS13 Absolute P T 43 90 2070.0 0.221000
GS6 Absolute P T 43 90 2081.5 0.240000
GS11 Absolute P T 43 90 2026.0 0.271000
GS1 Absolute P T 43 90 2045.0 0.296000
GS7 Absolute P T 43 90 2011.5 0.344000
GS4 Absolute P T 43 90 1977.0 0.414000
GS5 Absolute P T 43 90 1973.0 0.427000
GS3 Absolute P T 43 90 1902.0 0.568000
genomoWilcoxResults <- genomoRelativeWilcoxResults %>%
  full_join(genomoAbsoluteWilcoxResults, by = c("var", ".y.", "group1", "group2", "n1", "n2", "statistic", "p")) %>%
  mutate(p.star=case_when(p>0.05~"",
                          p<0.05&p>0.01~"*",
                          p<0.01&p>0.001~"**",
                          p<0.001~"***"),
         var=factor(var, levels = c(genomosCladeOrder,  "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total")))

Relative Abundance Plot:

momspiGenomoAbundancePTBPlot <-gardGenomoAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", !is.na(TermPre),
         var!="Uncharacterized_Gardnerella") %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(relativeAbundance)) %>%
  mutate(var=factor(var, levels = c(genomosCladeOrder,  "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total"))) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(data = genomoWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(.~var) +
  theme(legend.position = "none",
        strip.text.x = element_text(size=8),
        strip.text.y = element_text(size = 9),
        axis.title=element_text(size=16)) +
  labs(x=NULL, y="Relative\nAbundance")
momspiGenomoAbundancePTBPlot

Absolute Abundance Plot:

gardGenomoAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "bacLoad")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", !is.na(TermPre), bacLoad<2,
         var!="Uncharacterized_Gardnerella") %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  with_groups(c(SubjectID, Cohort, TermPre, var), summarize, abundance=mean(absoluteAbundance)) %>%
  mutate(var=factor(var, levels = c(genomosCladeOrder,  "Gardnerella_vaginalis"), labels = c(genomoNamesCladeOrder, "Total"))) %>%
  ggplot(aes(x=TermPre, y=abundance, color=TermPre, fill=TermPre)) +
  geom_quasirandom(alpha=0.5) +
  geom_boxplot(color="black", alpha=0) +
  geom_text(data = genomoWilcoxResults, aes(y=0.6, x=1.5, position="dodge", label=p.star), color="black", size=6, inherit.aes = FALSE) +
  binaryColors +
  facet_grid(.~var) +
  theme(legend.position = "none",
        strip.text.x = element_text(size=8),
        strip.text.y = element_text(size = 9),
        axis.title=element_text(size=16)) +
  labs(x=NULL, y="Absolute Abundance")

Number of clades by PTB

Mean clades per sample in term vs. preterm birth patients in MOMS-PI Only. Calculate Wilcoxon rank sum test

nCladesWilcox <- momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  spread(var, relativeAbundance) %>%
  mutate_at(clades, ~case_when(.x>=0.001~1,
                               .x<0.001~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
  wilcox_test(meanNClades~TermPre, alternative="greater")

nCladesWilcox %>%
  formattable()
.y. group1 group2 n1 n2 statistic p
meanNClades P T 43 90 2301.5 0.0389

Plot

momspiCladeCountPlot <- momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  spread(var, relativeAbundance) %>%
  mutate_at(clades, ~case_when(.x>=0.001~1,
                               .x<0.001~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
  ggplot(aes(x=TermPre, y=meanNClades, color=TermPre)) +
  geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
  geom_beeswarm(alpha=0.5, cex = 3, show.legend = FALSE) +
  ylim(0,7) +
  binaryColors +
  labs(x=NULL, y="Mean Gestational\nClade Count") +
  stat_pvalue_manual(nCladesWilcox, label="p", y.position = 6.7)
momspiCladeCountPlot

Assess mean number of clades of rarefying

Perform Wilcoxon Rank Sum test

rarefiedNCladesWilcox <- rarefiedGardDF %>%
  filter(Cohort=="MOMS-PI", !is.na(TermPre), SampleID %in% k100Samples) %>% 
  mutate_at(clades, ~case_when(.x>=0.001~1,
                               .x<0.001~0)) %>%
  mutate(nClades=C1+C2+C3+C4+C5+C6) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
  wilcox_test(meanNClades~TermPre, alternative = "greater")

rarefiedNCladesWilcox %>%
  formattable()  
.y. group1 group2 n1 n2 statistic p
meanNClades P T 42 89 2189.5 0.0571

Figure S9: Mean gestational clade richness vs. PTB after rarefying

rarefiedGardDF %>%
  filter(Cohort=="MOMS-PI", !is.na(TermPre), SampleID %in% k100Samples) %>% 
  select(-Gardnerella_vaginalis, -Uncharacterized_Gardnerella) %>%
  gather("Clade", "Abundance", c(C1, C2, C3, C4, C5, C6)) %>%
  mutate(Abundance=case_when(Abundance>=0.001~1,
                             Abundance<0.001~0)) %>%
  with_groups(c(SampleID, SubjectID, Cohort, TermPre), summarize, nClades=sum(Abundance)) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanNClades=mean(nClades)) %>%
  ggplot(aes(x=TermPre, y=meanNClades, color=TermPre)) +
  geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
  geom_beeswarm(alpha=0.5, cex = 3, show.legend = FALSE) +
  ylim(0,7) +
  binaryColors +
  labs(x=NULL, y="Mean Gestational\nClade Count") +
  stat_pvalue_manual(rarefiedNCladesWilcox, label="p", y.position = 6.7)

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "FigureS9_rarefiedCladeRichnessvsPTB.png", sep="_")))

Microbial load by PTB

Perform Wilcoxon Rank Sum tests

bacloadWilcox <- momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", bacLoad<2) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanBacLoad=mean(bacLoad)) %>%
  wilcox_test(meanBacLoad~TermPre, alternative="greater")

bacloadWilcox %>%
  formattable
.y. group1 group2 n1 n2 statistic p
meanBacLoad P T 43 90 2436 0.00803

Plot results

momspiBacLoadPTBplot <- momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  left_join(sgDF[,c("SampleID", "bacLoad")], by="SampleID") %>%
  filter(Cohort=="MOMS-PI", bacLoad<2) %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanBacLoad=mean(bacLoad)) %>%
  ggplot(aes(x=TermPre, color=TermPre, y=meanBacLoad)) +
  geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
  geom_beeswarm(alpha=0.5, cex = 2.1, show.legend = FALSE) +
  ylim(0,0.75) +
  binaryColors +
  labs(x=NULL, y="Mean Gestational\nMicrobial Load") +
  stat_pvalue_manual(bacloadWilcox, label="p", y.position = 0.7)

Figure 6: MOMS-PI Cohort Gardnerella and PTB

#fig.height=3, fig.width=5
momspiPTBAbundancePlots <- plot_grid(momspiCladeAbundancePTBPlot, momspiGenomoAbundancePTBPlot, ncol=1, labels = c("A", "B"), label_size = 15)
momspiCladeCountBacLoadPTBPlots <- plot_grid(momspiCladeCountPlot, momspiBacLoadPTBplot, nrow=1, labels = c("C", "D"), label_size = 15)
plot_grid(momspiPTBAbundancePlots, momspiCladeCountBacLoadPTBPlots, ncol=1, rel_heights = c(1, 0.6))

#ggsave(file.path(figureOut, paste(Sys.Date(), "Figure6_momspiPTB.png", sep="_")))

Library Size by PTB

libSizeWilcox <- momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  left_join(sgDF[,c("SampleID", "Sickle_pairs")], by="SampleID") %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanLibSize=mean(Sickle_pairs)) %>%
  wilcox_test(meanLibSize~TermPre, alternative="greater")

libSizeWilcox %>%
  formattable
.y. group1 group2 n1 n2 statistic p
meanLibSize P T 43 90 2007 0.365
momsptbDF %>%
  filter(Cohort=="MOMS-PI") %>%
  left_join(sgDF[,c("SampleID", "Sickle_pairs")], by="SampleID") %>%
  with_groups(c(SubjectID, Cohort, TermPre), summarize, meanLibSize=mean(Sickle_pairs)) %>%
  ggplot(aes(x=TermPre, color=TermPre, y=meanLibSize)) +
  geom_boxplot(alpha=0, color="black", show.legend=FALSE) +
  geom_beeswarm(alpha=0.5, cex = 2.1, show.legend = FALSE) +
  binaryColors +
  theme(axis.text = element_text(size=13),
        strip.text = element_text(size=13),
        axis.title = element_text(size=16)) +
  labs(x=NULL, y="Mean Subject Library Size") +
  stat_pvalue_manual(libSizeWilcox, label="p", y.position = 3.5e+07)

Gardnerella Clades by Race

Subject average abundance by self reported race

gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "Race")], by="SampleID") %>%
  group_by(SubjectID, Cohort, Race, var) %>%
  summarise(meanRelativeAbundance=mean(relativeAbundance)) %>%
  filter(var!="Uncharacterized_Gardnerella") %>%
  mutate(var=factor(var, levels = c(clades, "Gardnerella_vaginalis"), labels = c(clades, "Total")),
         Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs),
         Race=factor(Race, levels=c("Asian", "Black", "White", "Other", "Unknown"))) %>%
  ggplot(aes(x=Race, y=meanRelativeAbundance, color=Race, fill=Race)) +
  geom_point(alpha=0.5, position = position_jitterdodge()) +
  geom_boxplot(color="black", alpha=0, outlier.shape = 0) +
  facet_grid(Cohort~var) +
  scale_x_discrete(labels=c("A", "B", "W", "O")) +
  theme(legend.position = "bottom") +
  labs(x=NULL, y="Relative Abundance")

#ggsave(file.path(figureOut, paste(Sys.Date(), "FigureS11_cladeRelativeAbundanceRace.png", sep="_")))

Subject average absolute abundance by race

gardCladeAbU %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "Race", "bacLoad")], by="SampleID") %>%
  filter(bacLoad<2) %>%
  mutate(absoluteAbundance=relativeAbundance*bacLoad) %>%
  group_by(SubjectID, Cohort, Race, var) %>%
  summarise(meanAbsoluteAbundance=mean(absoluteAbundance)) %>%
  mutate(var=factor(var, levels = c(clades, "Uncharacterized_Gardnerella",  "Gardnerella_vaginalis"), labels = c(clades, "Unch.", "Total")),
         Cohort=factor(Cohort, levels = cohorts, labels=cohortAbbrevs),
         Race=factor(Race, levels=c("Asian", "Black", "White", "Other", "Unknown"))) %>%
  ggplot(aes(x=Race, y=meanAbsoluteAbundance, color=Race, fill=Race)) +
  geom_point(alpha=0.5, position = position_jitterdodge()) +
  geom_boxplot(color="black", alpha=0, outlier.shape = 0) +
  facet_grid(Cohort~var) +
  scale_x_discrete(labels=c("A", "B", "W", "O")) +
  theme(legend.position = "bottom") +
  labs(x=NULL, y="Absolute Abundance")

Session Info

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.4.0      rstatix_0.7.0     cowplot_1.1.1     coin_1.4-2       
##  [5] survival_3.2-13   ggExtra_0.9       ggbeeswarm_0.6.0  vegan_2.5-7      
##  [9] lattice_0.20-45   permute_0.9-5     reshape2_1.4.4    formattable_0.2.1
## [13] broom_0.7.9       kableExtra_1.3.4  forcats_0.5.1     stringr_1.4.0    
## [17] dplyr_1.0.7       purrr_0.3.4       readr_2.0.2       tidyr_1.1.4      
## [21] tibble_3.1.5      ggplot2_3.3.5     tidyverse_1.3.1  
## 
## loaded via a namespace (and not attached):
##   [1] TH.data_1.1-0      colorspace_2.0-2   ggsignif_0.6.3    
##   [4] ellipsis_0.3.2     modeltools_0.2-23  fs_1.5.0          
##   [7] rstudioapi_0.13    farver_2.1.0       bit64_4.0.5       
##  [10] fansi_0.5.0        mvtnorm_1.1-3      lubridate_1.8.0   
##  [13] xml2_1.3.2         codetools_0.2-18   splines_4.1.1     
##  [16] libcoin_1.0-9      knitr_1.36         jsonlite_1.7.2    
##  [19] cluster_2.1.2      dbplyr_2.1.1       shiny_1.7.1       
##  [22] compiler_4.1.1     httr_1.4.2         backports_1.2.1   
##  [25] assertthat_0.2.1   Matrix_1.3-4       fastmap_1.1.0     
##  [28] cli_3.4.1          later_1.3.0        htmltools_0.5.2   
##  [31] tools_4.1.1        gtable_0.3.0       glue_1.4.2        
##  [34] Rcpp_1.0.7         carData_3.0-5      cellranger_1.1.0  
##  [37] jquerylib_0.1.4    vctrs_0.5.1        svglite_2.0.0     
##  [40] nlme_3.1-153       xfun_0.26          ps_1.6.0          
##  [43] rvest_1.0.1        mime_0.12          miniUI_0.1.1.1    
##  [46] lifecycle_1.0.3    MASS_7.3-54        zoo_1.8-10        
##  [49] scales_1.1.1       vroom_1.5.5        hms_1.1.1         
##  [52] promises_1.2.0.1   parallel_4.1.1     sandwich_3.0-1    
##  [55] yaml_2.2.1         sass_0.4.0         stringi_1.7.8     
##  [58] highr_0.9          rlang_1.0.6        pkgconfig_2.0.3   
##  [61] systemfonts_1.0.2  matrixStats_0.61.0 evaluate_0.14     
##  [64] labeling_0.4.2     htmlwidgets_1.5.4  processx_3.5.2    
##  [67] bit_4.0.4          tidyselect_1.1.1   plyr_1.8.6        
##  [70] magrittr_2.0.1     R6_2.5.1           magick_2.7.3      
##  [73] generics_0.1.0     multcomp_1.4-18    DBI_1.1.1         
##  [76] pillar_1.6.3       haven_2.4.3        withr_2.4.2       
##  [79] mgcv_1.8-38        abind_1.4-5        modelr_0.1.8      
##  [82] crayon_1.4.1       car_3.0-12         utf8_1.2.2        
##  [85] tzdb_0.1.2         rmarkdown_2.11     grid_4.1.1        
##  [88] readxl_1.3.1       callr_3.7.0        reprex_2.0.1      
##  [91] digest_0.6.28      webshot_0.5.2      xtable_1.8-4      
##  [94] httpuv_1.6.3       stats4_4.1.1       munsell_0.5.0     
##  [97] beeswarm_0.4.0     viridisLite_0.4.0  vipor_0.4.5       
## [100] bslib_0.3.1